diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index d8bd2239ce..4806c7aa94 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -26,7 +26,7 @@
 - `dist_math.random_choice` now handles nd-arrays of category probabilities, and also handles sizes that are not `None`. Also removed unused `k` kwarg from `dist_math.random_choice`.
 - Changed `Categorical.mode` to preserve all the dimensions of `p` except the last one, which encodes each category's probability.
 - Changed initialization of `Categorical.p`. `p` is now normalized to sum to `1` inside `logp` and `random`, but not during initialization. This could hide negative values supplied to `p` as mentioned in #2082.
-- To be able to test for negative `p` values supplied to `Categorical`, `Categorical.logp` was changed to check for `sum(self.p, axis=-1) == 1` only if `self.p` is not a `Number`, `np.ndarray`, `TensorConstant` or `SharedVariable`. These cases are automatically normalized to sum to `1`. The other condition may originate from a `step_method` proposal, where `self.p` tensor's value may be set, but must sum to 1 nevertheless. This may break old code which intialized `p` with a theano expression and relied on the default normalization to get it to sum to 1. `Categorical.logp` now also checks that the used `p` has values lower than 1.
+- `Categorical` now accepts elements of `p` equal to `0`. `logp` will return `-inf` if there are `values` that index to the zero probability categories.
 
 ### Deprecations
 
diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index 952f987773..1dbe399363 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -1,6 +1,4 @@
-import numbers
 import numpy as np
-import theano
 import theano.tensor as tt
 from scipy import stats
 import warnings
@@ -736,27 +734,16 @@ def logp(self, value):
         # Clip values before using them for indexing
         value_clip = tt.clip(value, 0, k - 1)
 
-        # We must only check that the values sum to 1 if p comes from a
-        # tensor variable, i.e. when p is a step_method proposal. In the other
-        # cases we normalize ourselves
-        if not isinstance(p_, (numbers.Number,
-                               np.ndarray,
-                               tt.TensorConstant,
-                               tt.sharedvar.SharedVariable)):
-            sumto1 = theano.gradient.zero_grad(
-                tt.le(abs(tt.sum(p_, axis=-1) - 1), 1e-5))
-            p = p_
-        else:
-            p = p_ / tt.sum(p_, axis=-1, keepdims=True)
-            sumto1 = True
+        p = p_ / tt.sum(p_, axis=-1, keepdims=True)
 
         if p.ndim > 1:
-            a = tt.log(np.moveaxis(p, -1, 0)[value_clip])
+            pattern = (p.ndim - 1,) + tuple(range(p.ndim - 1))
+            a = tt.log(p.dimshuffle(pattern)[value_clip])
         else:
             a = tt.log(p[value_clip])
 
-        return bound(a, value >= 0, value <= (k - 1), sumto1,
-                     tt.all(p_ > 0, axis=-1), tt.all(p <= 1, axis=-1))
+        return bound(a, value >= 0, value <= (k - 1),
+                     tt.all(p_ >= 0, axis=-1), tt.all(p <= 1, axis=-1))
 
     def _repr_latex_(self, name=None, dist=None):
         if dist is None:
diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py
index 05eb14ea82..dbb02d36d8 100644
--- a/pymc3/tests/test_distributions.py
+++ b/pymc3/tests/test_distributions.py
@@ -349,7 +349,7 @@ def orderedlogistic_logpdf(value, eta, cutpoints):
     ps = np.array([invlogit(eta - cc) - invlogit(eta - cc1)
                    for cc, cc1 in zip(c[:-1], c[1:])])
     p = ps[value]
-    return np.where(np.all(ps > 0), np.log(p), -np.inf)
+    return np.where(np.all(ps >= 0), np.log(p), -np.inf)
 
 class Simplex:
     def __init__(self, n):
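
A minimal sketch (not part of the patch) of the behavior the release note describes, assuming pymc3 at this revision. It uses `pm.Categorical.dist(...)`, the standard way to build a distribution outside a model, to exercise the new `logp` directly:

```python
import numpy as np
import pymc3 as pm

# `p` may now contain exact zeros; it is normalized inside `logp`,
# so it does not have to sum to 1 up front.
d = pm.Categorical.dist(p=np.array([0.2, 0.0, 0.8]))

print(d.logp(0).eval())  # log(0.2), about -1.609
print(d.logp(1).eval())  # -inf: value indexes a zero-probability category

# Unnormalized weights are rescaled by p_ / tt.sum(p_, axis=-1, keepdims=True):
d2 = pm.Categorical.dist(p=np.array([1.0, 1.0, 2.0]))
print(d2.logp(2).eval())  # log(0.5), about -0.693

# Negative entries are still rejected, now by the tt.all(p_ >= 0) bound:
d3 = pm.Categorical.dist(p=np.array([1.5, -0.5]))
print(d3.logp(0).eval())  # -inf
```

Note the bound changed from `tt.all(p_ > 0)` to `tt.all(p_ >= 0)`: zero is a legal category probability, and `tt.log` maps it to `-inf` only for the values that actually hit it, rather than invalidating the whole distribution.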