stan-dev
diff --git a/‎stan/math/opencl/kernels/ordered_logistic_lpmf.hpp
+186 b/‎stan/math/opencl/kernels/ordered_logistic_lpmf.hpp
+186
diff --git a/‎stan/math/opencl/prim.hpp
+1 b/‎stan/math/opencl/prim.hpp
+1
diff --git a/‎stan/math/opencl/prim/ordered_logistic_lpmf.hpp
+163 b/‎stan/math/opencl/prim/ordered_logistic_lpmf.hpp
+163
diff --git a/‎stan/math/opencl/rev/operands_and_partials.hpp
+1-1 b/‎stan/math/opencl/rev/operands_and_partials.hpp
+1-1
@@ -0,0 +1,186 @@
+#ifndef STAN_MATH_OPENCL_KERNELS_ORDERED_LOGISTIC_LPMF_HPP
+#define STAN_MATH_OPENCL_KERNELS_ORDERED_LOGISTIC_LPMF_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/kernel_cl.hpp>
+#include <stan/math/opencl/kernels/device_functions/log1m_exp.hpp>
+#include <stan/math/opencl/kernels/device_functions/log1p_exp.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+
+// \cond
+static const char* ordered_logistic_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     * GPU implementation of the ordered logistic (ordinal regression) log
+     * probability mass function and its derivatives.
+     *
+     * Must be run with at least N_instances threads and local size equal to
+     * LOCAL_SIZE_. The in-group reductions only visit every element when the
+     * local size is a power of REDUCTION_STEP_SIZE.
+     * @param[out] logp_global partially summed log probability (1 value per
+     * work group). The value is NaN if any instance handled by the group has
+     * y outside [1, N_classes] or a non-finite lambda.
+     * @param[out] lambda_derivative derivative wrt lambda (1 value per
+     * instance). Not written for instances with invalid y or lambda.
+     * @param[out] cuts_derivative derivative wrt cuts. If is_cuts_matrix is
+     * set, N_classes - 1 values per instance are written; otherwise the
+     * values are partially summed (1 column per work group).
+     * @param[in] y_global a scalar or vector of classes.
+     * @param[in] lambda_global vector of continuous lambda variables
+     * @param[in] cuts cutpoints vector, or N_classes - 1 consecutive
+     * cutpoints per instance if is_cuts_matrix is set
+     * @param N_instances number of cases
+     * @param N_classes number of classes
+     * @param is_y_vector 0 or 1 - whether y is a vector (alternatively it is a
+     * scalar we need to broadcast)
+     * @param is_cuts_matrix 0 or 1 - whether cuts is a matrix (alternatively it
+     * is a vector we need to broadcast)
+     * @param need_lambda_derivative 0 or 1 - whether lambda_derivative needs to
+     * be computed
+     * @param need_cuts_derivative 0 or 1 - whether cuts_derivative needs to be
+     * computed
+     */
+    __kernel void ordered_logistic(
+        __global double* logp_global, __global double* lambda_derivative,
+        __global double* cuts_derivative, const __global int* y_global,
+        const __global double* lambda_global, const __global double* cuts,
+        const int N_instances, const int N_classes, const int is_y_vector,
+        const int is_cuts_matrix, const int need_lambda_derivative,
+        const int need_cuts_derivative) {
+      const int gid = get_global_id(0);
+      const int lid = get_local_id(0);
+      const int lsize = get_local_size(0);
+      const int wg_id = get_group_id(0);
+
+      __local double local_storage[LOCAL_SIZE_];
+
+      double logp = 0;
+      // d1 belongs to the lower cutpoint (index y - 2), d2 to the upper
+      // cutpoint (index y - 1) of the observed class.
+      double d1 = 0;
+      double d2 = 0;
+      // Initialized so that padding threads never hold an indeterminate value.
+      int y = 0;
+      // Offset of the cutpoints used by this thread; 0 if cuts are shared.
+      int cuts_start = (N_classes - 1) * gid * is_cuts_matrix;
+      // Most calculations only happen for relevant data within next if.
+      // Exceptions are reductions between threads that need barriers.
+      if (gid < N_instances) {
+        double lambda = lambda_global[gid];
+        y = y_global[gid * is_y_vector];
+        if (y < 1 || y > N_classes || !isfinite(lambda)) {
+          // Invalid input is reported to the host through a NaN sum.
+          logp = NAN;
+        } else {
+          // The first and the last class are unbounded on one side.
+          const double cut_y1
+              = y == N_classes ? INFINITY : cuts[cuts_start + y - 1];
+          const double cut_y2 = y == 1 ? -INFINITY : cuts[cuts_start + y - 2];
+          const double cut1 = lambda - cut_y1;
+          const double cut2 = lambda - cut_y2;
+
+          if (y != N_classes) {
+            logp -= log1p_exp(cut1);
+          }
+          if (y != 1) {
+            logp -= log1p_exp(-cut2);
+          }
+          if (y != 1 && y != N_classes) {
+            logp += log1m_exp(cut1 - cut2);
+          }
+
+          if (need_lambda_derivative || need_cuts_derivative) {
+            double exp_cuts_diff = exp(cut_y2 - cut_y1);
+            // Both branches equal 1 / (1 + exp(cut2)); the form is chosen so
+            // that exp() is only evaluated on a non-positive argument.
+            if (cut2 > 0) {
+              double exp_m_cut2 = exp(-cut2);
+              d1 = exp_m_cut2 / (1 + exp_m_cut2);
+            } else {
+              d1 = 1 / (1 + exp(cut2));
+            }
+            d1 -= exp_cuts_diff / (exp_cuts_diff - 1);
+            d2 = 1 / (1 - exp_cuts_diff);
+            // Same overflow-avoiding evaluation for 1 / (1 + exp(cut1)).
+            if (cut1 > 0) {
+              double exp_m_cut1 = exp(-cut1);
+              d2 -= exp_m_cut1 / (1 + exp_m_cut1);
+            } else {
+              d2 -= 1 / (1 + exp(cut1));
+            }
+
+            if (need_lambda_derivative) {
+              lambda_derivative[gid] = d1 - d2;
+            }
+          }
+        }
+      }
+      if (need_cuts_derivative) {
+        if (is_cuts_matrix) {
+          // Every instance owns its cutpoints, so no reduction is needed.
+          if (gid < N_instances) {
+            for (int i = 0; i < N_classes - 1; i++) {
+              if (y - 1 == i) {
+                cuts_derivative[cuts_start + i] = d2;
+              } else if (y - 2 == i) {
+                cuts_derivative[cuts_start + i] = -d1;
+              } else {
+                cuts_derivative[cuts_start + i] = 0.0;
+              }
+            }
+          }
+        } else {
+          for (int i = 0; i < N_classes - 1; i++) {
+            local_storage[lid] = 0;
+            if (gid < N_instances) {
+              if (y - 1 == i) {
+                local_storage[lid] = d2;
+              } else if (y - 2 == i) {
+                local_storage[lid] = -d1;
+              }
+            }
+            // Sum cuts_derivative, calculated by different threads.
+            // Since we can't sum between different work groups, we emit one
+            // number per work group. These must be summed on CPU for final
+            // result.
+            barrier(CLK_LOCAL_MEM_FENCE);
+            for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
+                 step /= REDUCTION_STEP_SIZE) {
+              if (lid < step) {
+                for (int j = 1; j < REDUCTION_STEP_SIZE; j++) {
+                  local_storage[lid] += local_storage[lid + step * j];
+                }
+              }
+              barrier(CLK_LOCAL_MEM_FENCE);
+            }
+            if (lid == 0) {
+              cuts_derivative[(N_classes - 1) * wg_id + i] = local_storage[0];
+            }
+            // Keep local_storage intact until thread 0 has stored the sum.
+            barrier(CLK_LOCAL_MEM_FENCE);
+          }
+        }
+      }
+      // Sum the log probability over the work group.
+      local_storage[lid] = logp;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
+           step /= REDUCTION_STEP_SIZE) {
+        if (lid < step) {
+          for (int j = 1; j < REDUCTION_STEP_SIZE; j++) {
+            local_storage[lid] += local_storage[lid + step * j];
+          }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+      if (lid == 0) {
+        logp_global[wg_id] = local_storage[0];
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/ordered_logistic_lpmf.hpp
+ * ordered_logistic() \endlink
+ *
+ * The kernel is assembled from the log1p_exp and log1m_exp device functions
+ * followed by the kernel source, with REDUCTION_STEP_SIZE set to 4 and
+ * LOCAL_SIZE_ set to 64. Its arguments are three output buffers, three input
+ * buffers and six int scalars, in the order of the kernel signature.
+ */
+const kernel_cl<out_buffer, out_buffer, out_buffer, in_buffer, in_buffer,
+                in_buffer, int, int, int, int, int, int>
+    ordered_logistic("ordered_logistic",
+                     {log1p_exp_device_function, log1m_exp_device_function,
+                      ordered_logistic_kernel_code},
+                     {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}});
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
@@ -156,6 +156,7 @@
 #include <stan/math/opencl/prim/normal_lpdf.hpp>
 #include <stan/math/opencl/prim/num_elements.hpp>
 #include <stan/math/opencl/prim/ordered_logistic_glm_lpmf.hpp>
+#include <stan/math/opencl/prim/ordered_logistic_lpmf.hpp>
 #include <stan/math/opencl/prim/pareto_lpdf.hpp>
 #include <stan/math/opencl/prim/pareto_type_2_lpdf.hpp>
 #include <stan/math/opencl/prim/poisson_log_glm_lpmf.hpp>
 
@@ -0,0 +1,163 @@
+#ifndef STAN_MATH_OPENCL_PRIM_ORDERED_LOGISTIC_LPMF_HPP
+#define STAN_MATH_OPENCL_PRIM_ORDERED_LOGISTIC_LPMF_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/kernel_generator.hpp>
+#include <stan/math/opencl/kernels/add.hpp>
+#include <stan/math/opencl/kernels/ordered_logistic_lpmf.hpp>
+#include <stan/math/prim/meta.hpp>
+#include <stan/math/prim/err.hpp>
+#include <stan/math/prim/fun/constants.hpp>
+#include <stan/math/prim/fun/elt_divide.hpp>
+#include <stan/math/prim/fun/elt_multiply.hpp>
+#include <stan/math/prim/functor/operands_and_partials.hpp>
+#include <stan/math/prim/err/constraint_tolerance.hpp>
+
+namespace stan {
+namespace math {
+
+/** \ingroup opencl
+ * Returns the (natural) log probability of the specified array
+ * of integers given the vector of continuous locations and
+ * specified cutpoints in an ordered logistic model.
+ *
+ * <p>Typically the continuous lambda
+ * will be the dot product of a vector of regression coefficients
+ * and a vector of predictors for the outcome
+ *
+  \f[
+    \frac{\partial }{\partial \lambda} =
+    \begin{cases}
+    -\mathrm{logit}^{-1}(\lambda - c_1) & \text{if } k = 1,\\
+    -(((1-e^{c_{k-1}-c_{k-2}})^{-1} - \mathrm{logit}^{-1}(c_{k-2}-\lambda)) +
+    ((1-e^{c_{k-2}-c_{k-1}})^{-1} - \mathrm{logit}^{-1}(c_{k-1}-\lambda)))
+    & \text{if } 1 < k < K, \text{ and}\\
+    \mathrm{logit}^{-1}(c_{K-2}-\lambda) & \text{if } k = K.
+    \end{cases}
+  \f]
+ *
+ * A `y` holding a single value is broadcast over all elements of `lambda`.
+ * If `cuts` has a single column, that column is shared by all instances;
+ * otherwise it must have one column of cutpoints per instance.
+ *
+ * @tparam propto True if calculating up to a proportion.
+ * @tparam T_y_cl Y variable type (integer or array of integers).
+ * @tparam T_loc_cl lambda type.
+ * @tparam T_cuts_cl Cut-point type.
+ * @param y Array of integers
+ * @param lambda Vector of continuous lambda variables.
+ * @param cuts Increasing vector of cutpoints, or a matrix with one column
+ * of increasing cutpoints per instance.
+ * @return Log probability of outcome given lambda and
+ * cutpoints; 0 if there are no instances or no cutpoints.
+ * @throw std::domain_error If the outcome is not between 1 and
+ * the number of cutpoints plus 1; if lambda is not finite; if the
+ * cutpoints contain a non-finite value; or if the cutpoints are not
+ * sorted in ascending order.
+ * @throw std::invalid_argument If y and lambda are different
+ * lengths, or if the number of cutpoint columns is greater than one
+ * and differs from the number of instances.
+ */
+template <bool propto, typename T_y_cl, typename T_loc_cl, typename T_cuts_cl,
+          require_all_prim_or_rev_kernel_expression_t<T_y_cl, T_loc_cl,
+                                                      T_cuts_cl>* = nullptr>
+inline return_type_t<T_y_cl, T_loc_cl, T_cuts_cl> ordered_logistic_lpmf(
+    const T_y_cl& y, const T_loc_cl& lambda, const T_cuts_cl& cuts) {
+  static const char* function = "ordered_logistic_lpmf(OpenCL)";
+  // Decided at run time: a container holding a single value is broadcast just
+  // like a scalar, so the kernel must not index it with the instance id.
+  const bool is_y_vector = size(y) != 1;
+
+  if (is_y_vector) {
+    check_size_match(function, "Size of ", "y", size(y), "Size of ", "lambda",
+                     size(lambda));
+  }
+
+  int N_instances = max_size(y, lambda);
+  int N_classes = cuts.rows() + 1;
+  int N_cut_sets = cuts.cols();
+
+  if (N_cut_sets > 1) {
+    check_size_match(function, "Length of lambda variables ", N_instances,
+                     "Number of cutpoint vectors ", N_cut_sets);
+  }
+  if (N_instances == 0 || N_classes == 1) {
+    return 0.0;
+  }
+  const auto& cuts_val = eval(value_of(cuts));
+  if (N_classes > 2) {
+    // Compare every cutpoint with its successor within the same column.
+    auto cuts_head
+        = block_zero_based(cuts_val, 0, 0, cuts.rows() - 1, N_cut_sets);
+    auto cuts_tail
+        = block_zero_based(cuts_val, 1, 0, cuts.rows() - 1, N_cut_sets);
+    check_cl(function, "Cuts", cuts_head, "ordered and finite")
+        = cuts_head < cuts_tail && isfinite(cuts_head) && isfinite(cuts_tail);
+  } else {
+    // A single cutpoint per set has nothing to be ordered against, but it
+    // still has to be finite.
+    check_cl(function, "Cuts", cuts_val, "finite") = isfinite(cuts_val);
+  }
+
+  if (!include_summand<propto, T_loc_cl, T_cuts_cl>::value) {
+    return 0.0;
+  }
+
+  const auto& y_val = eval(value_of(y));
+  const auto& lambda_val = eval(value_of(lambda));
+
+  const auto& y_val_cl = to_matrix_cl(y_val);
+
+  const int local_size
+      = opencl_kernels::ordered_logistic.get_option("LOCAL_SIZE_");
+  // Number of work groups needed to cover all instances.
+  const int wgs = (N_instances + local_size - 1) / local_size;
+
+  bool need_lambda_derivative = !is_constant_all<T_loc_cl>::value;
+  bool need_cuts_derivative = !is_constant_all<T_cuts_cl>::value;
+  // A single set of cutpoints is shared between several instances.
+  bool need_broadcasting = N_cut_sets == 1 && N_instances != 1;
+  matrix_cl<double> logp_cl(wgs, 1);
+  matrix_cl<double> lambda_derivative_cl(N_instances,
+                                         need_lambda_derivative ? 1 : 0);
+  // With broadcasting the kernel emits one partial sum column per work group.
+  matrix_cl<double> cuts_derivative_cl(
+      N_classes - 1,
+      need_cuts_derivative ? (need_broadcasting ? wgs : N_cut_sets) : 0);
+
+  try {
+    opencl_kernels::ordered_logistic(
+        cl::NDRange(local_size * wgs), cl::NDRange(local_size), logp_cl,
+        lambda_derivative_cl, cuts_derivative_cl, y_val_cl, lambda_val,
+        cuts_val, N_instances, N_classes, is_y_vector, !need_broadcasting,
+        need_lambda_derivative, need_cuts_derivative);
+  } catch (const cl::Error& e) {
+    check_opencl_error(function, e);
+  }
+
+  double logp = sum(from_matrix_cl(logp_cl));
+
+  // The kernel signals invalid y or lambda with NaN; the checks are only run
+  // in that case to produce the error.
+  if (!std::isfinite(logp)) {
+    results(check_cl(function, "Vector of dependent variables", y_val,
+                     "between 1 and number of classes"),
+            check_cl(function, "lambda vector", lambda_val, "finite"))
+        = expressions(y_val >= 1 && y_val <= static_cast<int>(N_classes),
+                      isfinite(lambda_val));
+  }
+  operands_and_partials<T_loc_cl, T_cuts_cl> ops_partials(lambda, cuts);
+
+  if (!is_constant_all<T_loc_cl>::value) {
+    ops_partials.edge1_.partials_ = lambda_derivative_cl;
+  }
+  if (!is_constant_all<T_cuts_cl>::value) {
+    if (need_broadcasting) {
+      // Finish on the host side the reduction started by the work groups.
+      ops_partials.edge2_.partials_ = rowwise_sum(cuts_derivative_cl);
+    } else {
+      ops_partials.edge2_.partials_ = std::move(cuts_derivative_cl);
+    }
+  }
+  return ops_partials.build(logp);
+}
+
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
@@ -20,7 +20,7 @@ class ops_partials_edge<double, var_value<Op>, require_matrix_cl_t<Op>> {
   partials_t partials_;                       // For univariate use-cases
   broadcast_array<partials_t> partials_vec_;  // For multivariate
   explicit ops_partials_edge(const var_value<Op>& ops)
-      : partials_(constant(0, ops.vi_->rows(), ops.vi_->cols())),
+      : partials_(constant(0.0, ops.vi_->rows(), ops.vi_->cols())),
         partials_vec_(partials_),
         operands_(ops) {}