stan-dev
diff --git a/‎.github/ISSUE_TEMPLATE.md
+1-1 b/‎.github/ISSUE_TEMPLATE.md
+1-1
diff --git a/‎RELEASE-NOTES.txt
+59 b/‎RELEASE-NOTES.txt
+59
diff --git a/‎doxygen/doxygen.cfg
+1-1 b/‎doxygen/doxygen.cfg
+1-1
diff --git a/‎make/tests
+1-1 b/‎make/tests
+1-1
diff --git a/‎stan/math/fwd/fun/mdivide_left.hpp
-1 b/‎stan/math/fwd/fun/mdivide_left.hpp
-1
diff --git a/‎stan/math/fwd/fun/mdivide_left_tri_low.hpp
-1 b/‎stan/math/fwd/fun/mdivide_left_tri_low.hpp
-1
diff --git a/‎stan/math/opencl/kernel_generator/block_zero_based.hpp
+12-2 b/‎stan/math/opencl/kernel_generator/block_zero_based.hpp
+12-2
diff --git a/‎stan/math/opencl/kernel_generator/elt_function_cl.hpp
+1 b/‎stan/math/opencl/kernel_generator/elt_function_cl.hpp
+1
diff --git a/‎stan/math/opencl/kernels/cumulative_sum.hpp
+224 b/‎stan/math/opencl/kernels/cumulative_sum.hpp
+224
diff --git a/‎stan/math/opencl/prim.hpp
+7 b/‎stan/math/opencl/prim.hpp
+7
@@ -25,4 +25,4 @@ If this is a **feature request**, show what you expect to happen if the feature
 
 
 #### Current Version:
-v4.0.1
+v4.1.0
@@ -1,5 +1,64 @@
 Stan Math Library Release Notes
 
+======================================================================
+v4.1.0 (2 June 2021)
+======================================================================
+
+ - Added the Cash-Karp numerical integrator to improve numerical integration of ODEs with semi-stiffness and/or rapid oscillations.(#2336)
+ - Added the quantile function.(#2398)
+ - Added custom reverse mode for diag_pre_multiply() and diag_post_multiply() functions.(#2405, #2453)
+ - Optimized `multi_normal_cholesky` for non-autodiff covariance. (#2439)
+ - Updated Sundials to 5.7.0.(#2441)
+ - Improved memory safety of nested parallelism.(#2445)
+ - Updated TBB to 2020.3.(#2447)
+ - Added the `STAN_NO_RANGE_CHECKS` macro which turns off bounds and range checks.(#2423, #2437)
+ - Optimized `gp_*_cov` functions, especially for large amounts of data.(#2464)
+ - Fixed compilation errors when using `unsigned` and `long` types with `apply_scalar_unary`.(#2469)
+ - Added the implementation of the loglogistic probability density function.(#2477)
+ - Added reverse mode specialization for `csr_matrix_times_vector(sparse data, dense parameter)`.(#2462)
+ - Allow tbb init to set the number of threads by an argument.(#2455)
+ - Fixed a bug with expressions in poisson distribution functions.(#2414)
+ - Fixed the off by one error in set_zero_all_adjoints_nested.(#2399)
+ - Fixed bug with printing Eigen expressions.(#2436)
+ - Refactored operands and partials to avoid extra allocations.(#2418)
+ - Tidied up distributions C++ code.(#2352)
+ - Updated the integrate_1d internal interface in preparation for closures.(#2397)
+ - Added docs for new contributors with a getting started guide and docs for contributing new distributions.(#2350, #2466)
+ - Added an ODE testing framework.(#2432)
+ - Replaced the finite difference approximation of the Hessian from one that is based on function calls to one that is based on gradients.(#2348)
+ - Updated code generation for expression tests.(#2419)
+ - Fixed a bug in expression tests and benchmark generation, where downloading `stanc.exe` did not work on Windows.(#2480)
+- Varmat:
+    - Add `rep_*` utility functions for new matrix type(#2358)
+    - `var<Matrix>` overloads for digamma, distance, Phi, inv_Phi, Phi_approx, sqrt, tail, tgamma, rows_dot_self, fma, offset_multiplier, bessel first and second kind, beta, binary log loss, ceil, erf, erfc, exp2, expm1, falling_factorial and floor (#2362, #2378, #2396, #2461)
+    - Added lb/ub/lub_constrain specializations.(#2373, #2382, #2387, #2379)
+    - Added script to automatically check stanc3 signatures for varmat compatibility.(#2434)
+- OpenCL:
+    - Fixed OpenCL implementations of distributions mostly not working with row vectors.(#2360)
+    - Added prim and rev OpenCL implementations for `to_matrix`, `to_vector`, `to_row_vector`, `to_array_1d`, `to_array_2d`, `append_array`, `reverse`, `symmetrize_from_lower_tri`, `symmetrize_from_upper_tri` and `trace`.(#2377, #2383, #2388)
+    - Added OpenCL functions `rep_matrix`, `rep_vector`, `rep_row_vector`, `rep_array` and `identity_matrix`.(#2388)
+    - Added operator %.()
+    - Reorganized how work is distributed between threads in generated kernels that use colwise reductions (including all distributions), significantly improving GPU performance.(#2392)
+    - Removed `.triangularTranspose()` member function from `matrix_cl` and `TriangularMapCL` enum. `.triangularTranspose()` is replaced by `symmetrize_from_lower_tri()`.(#2393)
+    - Added support for two dimensional reductions to kernel generator.(#2403)
+    - Added OpenCL implementations for functions `log_mix`, `log_softmax`, `log_sum_exp`, `rank`, `sd`, `softmax` and `variance`.(#2426)
+    - Added OpenCL implementations for `ub_constrain`, `lb_constrain`, `lub_constrain`, `offset_multiplier_constrain` and `unit_vector_constrain`.(#2427)
+    - Added OpenCL implementation for `prod` function and kernel generator operation for rowwise, colwise and 2d product.(#2433)
+    - Added OpenCL implementations for functions: `bernoulli_cdf`, `bernoulli_lcdf`, `bernoulli_lccdf`, `cauchy_cdf`, `cauchy_lcdf`, `cauchy_lccdf`.(#2446)
+    - Added OpenCL implementations for functions `double_exponential_cdf`, `double_exponential_lcdf`, `double_exponential_lccdf`, `exp_mod_normal_cdf`, `exp_mod_normal_lcdf` and `exp_mod_normal_lccdf`.(#2449)
+    - Added OpenCL implementations for functions `exponential_cdf`, `exponential_lcdf`, `exponential_lccdf`, `frechet_cdf`, `frechet_lcdf` and `frechet_lccdf`.(#2450)
+    - Added OpenCL implementations for functions `gumbel_cdf`, `gumbel_lcdf`, `gumbel_lccdf`, `logistic_cdf`, `logistic_lcdf` and `logistic_lccdf`.(#2451)
+    - Added a new kernel generator operation that allows writing custom OpenCL code.(#2454)
+    - Added OpenCL implementations for functions `pareto_cdf`, `pareto_lccdf`, `pareto_lcdf`, `pareto_type_2_cdf`, `pareto_type_2_lccdf`, and `pareto_type_2_lcdf`.(#2456)
+    - Added OpenCL implementations for functions: `rayleigh_cdf`, `rayleigh_lccdf`, `rayleigh_lcdf`, `skew_double_exponential_cdf`, `skew_double_exponential_lccdf`, `skew_double_exponential_lcdf` and `skew_double_exponential_lpdf`.(#2457)
+    - Added OpenCL implementations for functions `lognormal_cdf`, `lognormal_lccdf`, `lognormal_lcdf`, `normal_cdf`, `normal_lccdf`, `normal_lcdf`.(#2458)
+    - Added OpenCL implementations for functions `std_normal_cdf`, `std_normal_lccdf`, `std_normal_lcdf`, `uniform_cdf`, `uniform_lccdf` and `uniform_lcdf`.(#2459)
+    - Added OpenCL implementations for functions `weibull_cdf`, `weibull_lccdf` and `weibull_lcdf`.(#2460)
+    - Removed unused OpenCL kernels and checks.(#2463)
+    - Added OpenCL prim implementation for functions: `gp_exponential_cov`, `gp_matern32_cov`, `gp_matern52_cov` and both prim and rev implementation for `gp_dot_prod_cov`.(#2471)
+    - Added reference (`ref_type`) for kernel generator expressions.(#2404)
+    - Added typecast operation to kernel generator.(#2472)
+
 ======================================================================
 v4.0.1 (17 February 2021)
 ======================================================================
 
@@ -38,7 +38,7 @@ PROJECT_NAME           = "Stan Math Library"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 4.0.1
+PROJECT_NUMBER         = 4.1.0
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
 
@@ -100,7 +100,7 @@ else
 endif
 
 %.hpp-test : %.hpp test/dummy.cpp
-	$(COMPILE.cpp) $(CXXFLAGS) -O0 -include $^ -o $(DEV_NULL)
+	$(COMPILE.cpp) $(CXXFLAGS) -O0 -include $^ -o $(DEV_NULL) -Wunused-local-typedefs
 
 test/dummy.cpp:
 	@mkdir -p test
 
@@ -60,7 +60,6 @@ template <typename T1, typename T2,
 inline Eigen::Matrix<value_type_t<T2>, T1::RowsAtCompileTime,
                      T2::ColsAtCompileTime>
 mdivide_left(const T1& A, const T2& b) {
-  using T = typename value_type_t<T2>::Scalar;
   constexpr int S1 = T1::RowsAtCompileTime;
   constexpr int C2 = T2::ColsAtCompileTime;
 
 
@@ -55,7 +55,6 @@ template <typename T1, typename T2, require_eigen_t<T1>* = nullptr,
 inline Eigen::Matrix<value_type_t<T2>, T1::RowsAtCompileTime,
                      T2::ColsAtCompileTime>
 mdivide_left_tri_low(const T1& A, const T2& b) {
-  using T = typename value_type_t<T2>::Scalar;
   constexpr int S1 = T1::RowsAtCompileTime;
   constexpr int C2 = T2::ColsAtCompileTime;
 
 
@@ -272,9 +272,19 @@ class block_
   inline void set_view(int bottom_diagonal, int top_diagonal,
                        int bottom_zero_diagonal, int top_zero_diagonal) const {
     int change = start_col_ - start_row_;
-    this->template get_arg<0>().set_view(
+    auto& a = this->template get_arg<0>();
+    a.set_view(
         bottom_diagonal + change, top_diagonal + change,
-        bottom_zero_diagonal + change, top_zero_diagonal + change);
+        (start_col_ == 0 && start_row_ <= 1 && start_row_ + rows_ == a.rows()
+                 && start_col_ + cols_ >= std::min(a.rows() - 1, a.cols())
+             ? bottom_zero_diagonal
+             : bottom_diagonal)
+            + change,
+        (start_row_ == 0 && start_col_ <= 1 && start_col_ + cols_ == a.cols()
+                 && start_row_ + rows_ >= std::min(a.rows(), a.cols() - 1)
+             ? top_zero_diagonal
+             : top_diagonal)
+            + change);
   }
 
   /**
 
@@ -338,6 +338,7 @@ ADD_BINARY_FUNCTION_WITH_INCLUDES(fmod)
 ADD_BINARY_FUNCTION_WITH_INCLUDES(hypot)
 ADD_BINARY_FUNCTION_WITH_INCLUDES(ldexp)
 ADD_BINARY_FUNCTION_WITH_INCLUDES(pow)
+ADD_BINARY_FUNCTION_WITH_INCLUDES(copysign)
 
 ADD_BINARY_FUNCTION_WITH_INCLUDES(
     beta, stan::math::opencl_kernels::beta_device_function)
 
@@ -0,0 +1,224 @@
+#ifndef STAN_MATH_OPENCL_KERNELS_CUMULATIVE_SUM_HPP
+#define STAN_MATH_OPENCL_KERNELS_CUMULATIVE_SUM_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/kernel_cl.hpp>
+#include <stan/math/opencl/buffer_types.hpp>
+#include <stan/math/opencl/matrix_cl_view.hpp>
+#include <string>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+
+// \cond
+static const char *cumulative_sum1_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     * First kernel of the cumulative sum implementation. Each thread sums the
+     * contiguous range of input elements assigned to it, namely the indices
+     * from gid * size / gsize (inclusive) to (gid + 1) * size / gsize
+     * (exclusive); a thread with an empty range contributes 0.
+     * Threads within the same work group then combine their sums through
+     * local memory, up to REDUCTION_STEP_SIZE values at a time, so that each
+     * thread ends up with the sum of its own result and the results of all
+     * threads with a lower local id in its work group.
+     *
+     * The work group total is written by the thread with local id
+     * LOCAL_SIZE_ - 1 and the local buffer holds LOCAL_SIZE_ elements, so the
+     * kernel is presumably meant to be launched with a local size of exactly
+     * LOCAL_SIZE_ (TODO confirm against the host code).
+     *
+     * @param[out] out_wgs total of each work group, indexed by work group id
+     * @param[out] out_threads running sum within the work group for each
+     * thread, indexed by global id
+     * @param[in] in input data
+     * @param size number of elements in the input
+     */
+    __kernel void cumulative_sum1(__global SCAL *out_wgs,
+                                  __global SCAL *out_threads, __global SCAL *in,
+                                  int size) {
+      const int gid = get_global_id(0);
+      const int lid = get_local_id(0);
+      const int lsize = get_local_size(0);
+      const int wg_id = get_group_id(0);
+      const int gsize = get_global_size(0);
+
+      int start = (int)((long)gid * size / gsize);      // NOLINT
+      int end = (int)((long)(gid + 1) * size / gsize);  // NOLINT
+      __local SCAL local_storage[LOCAL_SIZE_];
+
+      SCAL acc = 0;
+      if (start != end) {
+        acc = in[start];
+        for (int i = start + 1; i < end; i++) {
+          acc += in[i];
+        }
+      }
+      for (int step = 1; step < lsize; step *= REDUCTION_STEP_SIZE) {
+        local_storage[lid] = acc;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for (int i = 1; i < REDUCTION_STEP_SIZE && step * i <= lid; i++) {
+          acc += local_storage[lid - step * i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+      out_threads[gid] = acc;
+      if (lid == LOCAL_SIZE_ - 1) {
+        out_wgs[wg_id] = acc;
+      }
+    }
+    // \cond
+);
+// \endcond
+
+// \cond
+static const char *cumulative_sum2_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     * Second kernel of the cumulative sum implementation. Calculates prefix sum
+     * of given data in place using a single work group (must be run with a
+     * single work group).
+     *
+     * Each thread first sums its contiguous range of elements. The per-thread
+     * sums are then combined through local memory into running sums. Finally
+     * each thread with a non-empty range overwrites its elements with the
+     * running sum, starting from the running sum of the previous thread (or
+     * from 0 for the first thread).
+     *
+     * The local buffer holds LOCAL_SIZE_ elements and is indexed by global id,
+     * so the global size must not exceed LOCAL_SIZE_.
+     *
+     * @param[in, out] data data to calculate cumulative sum of
+     * @param size number of elements in the input
+     */
+    __kernel void cumulative_sum2(__global SCAL *data, int size) {
+      const int gid = get_global_id(0);
+      const int gsize = get_global_size(0);
+
+      int start = (int)((long)gid * size / gsize);      // NOLINT
+      int end = (int)((long)(gid + 1) * size / gsize);  // NOLINT
+      __local SCAL local_storage[LOCAL_SIZE_];
+
+      SCAL acc;
+      if (start == end) {
+        acc = 0;
+      } else {
+        acc = data[start];
+        for (int i = start + 1; i < end; i++) {
+          acc += data[i];
+        }
+      }
+      local_storage[gid] = acc;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int step = 1; step < gsize; step *= REDUCTION_STEP_SIZE) {
+        for (int i = 1; i < REDUCTION_STEP_SIZE && step * i <= gid; i++) {
+          acc += local_storage[gid - step * i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        local_storage[gid] = acc;
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+      if (start != end) {
+        if (gid == 0) {
+          acc = 0;
+        } else {
+          acc = local_storage[gid - 1];
+        }
+        for (int i = start; i < end; i++) {
+          acc += data[i];
+          data[i] = acc;
+        }
+      }
+    }
+    // \cond
+);
+// \endcond
+
+// \cond
+static const char *cumulative_sum3_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     * Third kernel of the cumulative sum implementation. Given sums of threads
+     * and cumulative sum of those calculates cumulative sum of given array.
+     * Must be run with the same number of threads and work groups as the first
+     * cumulative sum kernel.
+     *
+     * Each thread starts its accumulator from the entry of in_wgs belonging
+     * to the previous work group (skipped for the first work group) plus the
+     * entry of in_threads belonging to the previous thread (skipped for the
+     * first thread of a work group). It then walks its contiguous range of
+     * in_data, writing the running sum to out.
+     *
+     * NOTE(review): lsize and local_storage are declared but never used in
+     * this kernel.
+     *
+     * @param[out] out cumulatively summed input
+     * @param[in] in_data input data
+     * @param[in] in_threads summed results from each thread from the first
+     * kernel
+     * @param[in] in_wgs cumulatively summed results from each work group
+     * (calculated by previous two kernels)
+     * @param size number of elements in the input
+     */
+    __kernel void cumulative_sum3(__global SCAL *out, __global SCAL *in_data,
+                                  __global SCAL *in_threads,
+                                  __global SCAL *in_wgs, int size) {
+      const int gid = get_global_id(0);
+      const int lid = get_local_id(0);
+      const int lsize = get_local_size(0);
+      const int wg_id = get_group_id(0);
+      const int gsize = get_global_size(0);
+
+      int start = (int)((long)gid * size / gsize);      // NOLINT
+      int end = (int)((long)(gid + 1) * size / gsize);  // NOLINT
+      __local SCAL local_storage[LOCAL_SIZE_];
+
+      SCAL acc = 0;
+      if (wg_id != 0) {
+        acc = in_wgs[wg_id - 1];
+      }
+      if (lid != 0) {
+        acc += in_threads[gid - 1];
+      }
+      for (int i = start; i < end; i++) {
+        acc += in_data[i];
+        out[i] = acc;
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * struct containing cumulative_sum kernels, grouped by scalar type.
+ *
+ * The primary template is empty; specializations are provided for `double`
+ * and `int`. Each specialization exposes three static kernel objects:
+ * - `kernel1` wraps the `cumulative_sum1` kernel (two output buffers, one
+ *   input buffer and the input size),
+ * - `kernel2` wraps the `cumulative_sum2` kernel (one in/out buffer and its
+ *   size),
+ * - `kernel3` wraps the `cumulative_sum3` kernel (one output buffer, three
+ *   input buffers and the input size).
+ *
+ * Every kernel source is prefixed with a `#define SCAL` for the scalar type
+ * and is built with `REDUCTION_STEP_SIZE` set to 4. `LOCAL_SIZE_` is 16 for
+ * `kernel1` and `kernel3` and 1024 for `kernel2`.
+ *
+ * @tparam Scalar scalar type of the data the kernels operate on
+ *
+ * NOTE(review): the second template parameter is never used; presumably it
+ * only exists so that the static members can be defined in this header as
+ * members of a class template - confirm.
+ */
+template <typename Scalar, typename = void>
+struct cumulative_sum {};
+
+template <typename T>
+struct cumulative_sum<double, T> {
+  static const kernel_cl<out_buffer, out_buffer, in_buffer, int> kernel1;
+  static const kernel_cl<in_out_buffer, int> kernel2;
+  static const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
+      kernel3;
+};
+template <typename T>
+struct cumulative_sum<int, T> {
+  static const kernel_cl<out_buffer, out_buffer, in_buffer, int> kernel1;
+  static const kernel_cl<in_out_buffer, int> kernel2;
+  static const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
+      kernel3;
+};
+
+template <typename T>
+const kernel_cl<out_buffer, out_buffer, in_buffer, int>
+    cumulative_sum<double, T>::kernel1("cumulative_sum1",
+                                       {"#define SCAL double\n",
+                                        cumulative_sum1_kernel_code},
+                                       {{"REDUCTION_STEP_SIZE", 4},
+                                        {"LOCAL_SIZE_", 16}});
+template <typename T>
+const kernel_cl<out_buffer, out_buffer, in_buffer, int>
+    cumulative_sum<int, T>::kernel1(
+        "cumulative_sum1", {"#define SCAL int\n", cumulative_sum1_kernel_code},
+        {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 16}});
+
+template <typename T>
+const kernel_cl<in_out_buffer, int> cumulative_sum<double, T>::kernel2(
+    "cumulative_sum2", {"#define SCAL double\n", cumulative_sum2_kernel_code},
+    {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}});
+template <typename T>
+const kernel_cl<in_out_buffer, int> cumulative_sum<int, T>::kernel2(
+    "cumulative_sum2", {"#define SCAL int\n", cumulative_sum2_kernel_code},
+    {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 1024}});
+
+template <typename T>
+const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
+    cumulative_sum<double, T>::kernel3("cumulative_sum3",
+                                       {"#define SCAL double\n",
+                                        cumulative_sum3_kernel_code},
+                                       {{"REDUCTION_STEP_SIZE", 4},
+                                        {"LOCAL_SIZE_", 16}});
+template <typename T>
+const kernel_cl<out_buffer, in_buffer, in_buffer, in_buffer, int>
+    cumulative_sum<int, T>::kernel3(
+        "cumulative_sum3", {"#define SCAL int\n", cumulative_sum3_kernel_code},
+        {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 16}});
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
@@ -97,6 +97,7 @@
 #include <stan/math/opencl/to_ref_for_opencl.hpp>
 #include <stan/math/opencl/value_type.hpp>
 #include <stan/math/opencl/zeros_strict_tri.hpp>
+#include <stan/math/opencl/qr_decomposition.hpp>
 
 #include <stan/math/opencl/prim/add_diag.hpp>
 #include <stan/math/opencl/prim/append_array.hpp>
@@ -124,6 +125,7 @@
 #include <stan/math/opencl/prim/columns_dot_product.hpp>
 #include <stan/math/opencl/prim/columns_dot_self.hpp>
 #include <stan/math/opencl/prim/crossprod.hpp>
+#include <stan/math/opencl/prim/cumulative_sum.hpp>
 #include <stan/math/opencl/prim/diag_matrix.hpp>
 #include <stan/math/opencl/prim/diag_pre_multiply.hpp>
 #include <stan/math/opencl/prim/diag_post_multiply.hpp>
@@ -161,6 +163,7 @@
 #include <stan/math/opencl/prim/gumbel_lcdf.hpp>
 #include <stan/math/opencl/prim/gumbel_lpdf.hpp>
 #include <stan/math/opencl/prim/head.hpp>
+#include <stan/math/opencl/prim/identity_matrix.hpp>
 #include <stan/math/opencl/prim/inv.hpp>
 #include <stan/math/opencl/prim/inv_chi_square_lpdf.hpp>
 #include <stan/math/opencl/prim/inv_cloglog.hpp>
@@ -210,6 +213,10 @@
 #include <stan/math/opencl/prim/poisson_log_lpmf.hpp>
 #include <stan/math/opencl/prim/poisson_lpmf.hpp>
 #include <stan/math/opencl/prim/prod.hpp>
+#include <stan/math/opencl/prim/qr_Q.hpp>
+#include <stan/math/opencl/prim/qr_R.hpp>
+#include <stan/math/opencl/prim/qr_thin_Q.hpp>
+#include <stan/math/opencl/prim/qr_thin_R.hpp>
 #include <stan/math/opencl/prim/rank.hpp>
 #include <stan/math/opencl/prim/rayleigh_cdf.hpp>
 #include <stan/math/opencl/prim/rayleigh_lccdf.hpp>
Original file line number	Diff line number	Diff line change
`@@ -25,4 +25,4 @@ If this is a feature request, show what you expect to happen if the feature`
`25`	`25`
`26`	`26`
`27`	`27`	`#### Current Version:`
`28`		`-v4.0.1`
	`28`	`+v4.1.0`