Skip to content

Commit 63f46c1

Browse files
committed
Merge pull request #6422 from jseabold/make-missing
ENH: Add functions for creating dataframes with NaNs
2 parents b06e9d0 + 8c09a9e commit 63f46c1

File tree

1 file changed

+66
-0
lines changed

1 file changed

+66
-0
lines changed

pandas/util/testing.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,72 @@ def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True,
896896
return DataFrame(data, index, columns, dtype=dtype)
897897

898898

899+
def _create_missing_idx(nrows, ncols, density, random_state=None):
900+
if random_state is None:
901+
random_state = np.random
902+
else:
903+
random_state = np.random.RandomState(random_state)
904+
905+
# below is cribbed from scipy.sparse
906+
size = int(np.round((1 - density) * nrows * ncols))
907+
# generate a few more to ensure unique values
908+
min_rows = 5
909+
fac = 1.02
910+
extra_size = min(size + min_rows, fac * size)
911+
912+
def _gen_unique_rand(rng, _extra_size):
913+
ind = rng.rand(int(_extra_size))
914+
return np.unique(np.floor(ind * nrows * ncols))[:size]
915+
916+
ind = _gen_unique_rand(random_state, extra_size)
917+
while ind.size < size:
918+
extra_size *= 1.05
919+
ind = _gen_unique_rand(random_state, extra_size)
920+
921+
j = np.floor(ind * 1. / nrows)
922+
i = (ind - j * nrows)
923+
return i.tolist(), j.tolist()
924+
925+
926+
def makeMissingCustomDataframe(nrows, ncols, density=.9, random_state=None,
                               c_idx_names=True, r_idx_names=True,
                               c_idx_nlevels=1, r_idx_nlevels=1,
                               data_gen_f=None,
                               c_ndupe_l=None, r_ndupe_l=None, dtype=None,
                               c_idx_type=None, r_idx_type=None):
    """
    Build a frame with makeCustomDataframe and blank out random cells.

    Parameters
    ----------
    density : float, optional
        Float in (0, 1) that gives the percentage of non-missing numbers in
        the DataFrame.
    random_state : {np.random.RandomState, int}, optional
        Random number generator or random seed.

    See makeCustomDataframe for descriptions of the rest of the parameters.

    Returns
    -------
    DataFrame
        The generated frame with the selected cells set to NaN.
    """
    df = makeCustomDataframe(nrows, ncols, c_idx_names=c_idx_names,
                             r_idx_names=r_idx_names,
                             c_idx_nlevels=c_idx_nlevels,
                             r_idx_nlevels=r_idx_nlevels,
                             data_gen_f=data_gen_f,
                             c_ndupe_l=c_ndupe_l, r_ndupe_l=r_ndupe_l,
                             dtype=dtype, c_idx_type=c_idx_type,
                             r_idx_type=r_idx_type)

    i, j = _create_missing_idx(nrows, ncols, density, random_state)

    # ``df.iloc[i, j]`` with two lists addresses every row/column
    # combination, not the paired cells, so mark the pairs in a boolean
    # mask instead.  The int cast keeps this working if the helper hands
    # back float positions.
    missing = np.zeros(df.shape, dtype=bool)
    missing[np.asarray(i, dtype=int), np.asarray(j, dtype=int)] = True
    return df.mask(missing)
955+
956+
957+
def makeMissingDataframe(density=.9, random_state=None):
    """
    Build a frame with makeDataFrame and blank out random cells.

    Parameters
    ----------
    density : float, optional
        Float in (0, 1) that gives the percentage of non-missing numbers in
        the DataFrame.
    random_state : {np.random.RandomState, int}, optional
        Random number generator or random seed.

    Returns
    -------
    DataFrame
        The generated frame with the selected cells set to NaN.
    """
    df = makeDataFrame()
    i, j = _create_missing_idx(*df.shape, density=density,
                               random_state=random_state)

    # ``df.iloc[i, j]`` with two lists addresses every row/column
    # combination, not the paired cells, so mark the pairs in a boolean
    # mask instead.  The int cast keeps this working if the helper hands
    # back float positions.
    missing = np.zeros(df.shape, dtype=bool)
    missing[np.asarray(i, dtype=int), np.asarray(j, dtype=int)] = True
    return df.mask(missing)
963+
964+
899965
def add_nans(panel):
900966
I, J, N = panel.shape
901967
for i, item in enumerate(panel.items):

0 commit comments

Comments
 (0)