taichi-dev
diff --git a/‎.github/workflows/testing.yml
+1-1 b/‎.github/workflows/testing.yml
+1-1
diff --git a/‎cmake/TaichiExportCore.cmake
+2 b/‎cmake/TaichiExportCore.cmake
+2
diff --git a/‎docs/lang/articles/advanced/performance.md
+7-3 b/‎docs/lang/articles/advanced/performance.md
+7-3
diff --git a/‎python/taichi/ui/staging_buffer.py
+1-1 b/‎python/taichi/ui/staging_buffer.py
+1-1
@@ -450,7 +450,7 @@ jobs:
           fi
           docker create --user dev --name taichi_build_desktop --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \
             -e PY -e GPU_BUILD -e PROJECT_NAME -e TAICHI_CMAKE_ARGS -e DISPLAY -e EXPORT_CORE\
-            registry.taichigraphics.com/taichidev-ubuntu18.04:v0.2.1 \
+            registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.0 \
             /home/dev/taichi/.github/workflows/scripts/unix_build.sh
           # A tarball is needed because sccache needs some permissions that only the file owner has.
           # 1000 is the uid and gid of user "dev" in the container.
 
@@ -4,3 +4,5 @@ set(TAICHI_EXPORT_CORE_NAME taichi_export_core)
 
 add_library(${TAICHI_EXPORT_CORE_NAME} SHARED)
 target_link_libraries(${TAICHI_EXPORT_CORE_NAME} PRIVATE taichi_isolated_core)
+set_target_properties(${TAICHI_EXPORT_CORE_NAME} PROPERTIES  # emit the shared library into <this dir>/build
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build")  # target property; the CMAKE_-prefixed name is only the variable that seeds it
@@ -153,14 +153,18 @@ Additionally, the last atomic add to the global memory `s[None]` is optimized us
 CUDA's warp-level intrinsics, further reducing the number of required atomic adds.
 
 Currently, Taichi supports TLS optimization for these reduction operators: `add`,
-`sub`, `min` and `max`. [Here](https://github.com/taichi-dev/taichi/pull/2956) is
-a benchmark comparison when running a global max reduction on a 1-D Taichi field
+`sub`, `min` and `max` on **0D** scalar/vector/matrix `ti.field`s. It is not yet
+supported on `ti.ndarray`s. [Here](https://github.com/taichi-dev/taichi/pull/2956)
+is a benchmark comparison when running a global max reduction on a 1-D Taichi field
 of 8M floats on an Nvidia GeForce RTX 3090 card:
 
 * TLS disabled: 5.2 x 1e3 us
 * TLS enabled: 5.7 x 1e1 us
 
-TLS has led to an approximately 100x speedup.
+TLS has led to an approximately 100x speedup. We also show that a TLS reduction sum
+achieves performance comparable to CUDA implementations; see the
+[benchmark](https://github.com/taichi-dev/taichi_benchmark/tree/main/reduce_sum) for
+details.
 
 ### Block Local Storage (BLS)
 
 
@@ -100,7 +100,7 @@ def copy_image_u8_to_u8(src: ti.template(), dst: ti.template(),
                         num_components: ti.template()):
     for i, j in src:
         for k in ti.static(range(num_components)):
-            dst[i, j][k] = src[i, j][k]
+            dst[i, j][k] = ti.cast(src[i, j][k], ti.u8)
         if num_components < 4:
             # alpha channel
             dst[i, j][3] = u8(255)