import pandas as pd
import numpy as np
# Ten standard-normal draws; every entry except the first two and the last
# two is then blanked out, so the series is mostly missing.
ts = pd.Series(np.random.randn(10))
ts.iloc[2:-2] = np.nan

# Wrap the values in a SparseArray, which stores only the non-NaN entries
# together with their positions.
# (The legacy `ts.to_sparse()` spelling is not used; it was removed in
# pandas 1.0.)
sts = pd.arrays.SparseArray(ts)
print(sts)
[-0.2651666869710405, -0.5136353674319685, nan, nan, nan, nan, nan, nan, -1.0887837733294412, 1.0501437514527197] Fill: nan IntIndex Indices: array([0, 1, 8, 9], dtype=int32)
import pandas as pd
import numpy as np
# Build a 10000 x 4 DataFrame of random values.
df = pd.DataFrame(np.random.randn(10000, 4))
# Blank out rows 0..9998. `.loc` slicing is label-based and includes the end
# label, so only the final row (label 9999) keeps its values.
df.loc[:9998] = np.nan
# Convert every column to a sparse float dtype whose fill value is NaN.
sdf = df.astype(pd.SparseDtype("float", np.nan))
# Density is the fraction of stored (non-fill) values: 4 of 40000 here.
print(f"稀疏度: {sdf.sparse.density:.4f}")
稀疏度: 0.0001
# Share of memory saved by the sparse frame relative to the dense one.
sparse_bytes = sdf.memory_usage().sum()
dense_bytes = df.memory_usage().sum()
print(f"内存节省: {(1 - sparse_bytes / dense_bytes) * 100:.1f}%")
内存节省: 99.9%
通过调用 to_dense（对于 Series，使用 `.sparse.to_dense()`），可以将任何稀疏对象转换回标准密集形式。
import pandas as pd
import numpy as np
# Ten random draws with every entry except the first two and the last two
# set to NaN.
ts = pd.Series(np.random.randn(10))
ts.iloc[2:-2] = np.nan
# Convert to a sparse Series (float64 values, NaN as the fill value).
sts = ts.astype(pd.SparseDtype("float64", np.nan))
# Convert back to an ordinary dense Series through the `.sparse` accessor.
print(sts.sparse.to_dense())
0 0.768291 1 0.528354 2 NaN 3 NaN 4 NaN 5 NaN 6 NaN 7 NaN 8 0.179859 9 -0.026992 dtype: float64
稀疏对象中被省略、不实际存储的值称为填充值（fill value）。各数据类型的默认填充值如下：
float64 — np.nan
int64 — 0
bool — False
执行下面的代码来理解相同的内容:
import pandas as pd
import numpy as np
# A dense Series in which two of the three entries are missing; the NaNs
# make pandas store it as float64.
s = pd.Series([1, np.nan, np.nan])
print(s)
0 1.0 1 NaN 2 NaN dtype: float64
# Visual separator between the dense and the sparse printouts.
print("=============================")
=============================
# Re-type the Series as sparse float64 with NaN as the fill value.
nan_filled_float = pd.SparseDtype("float64", np.nan)
sparse_s = s.astype(nan_filled_float)
print("稀疏Series:")
稀疏Series:
# Print the sparse Series built above.
print(sparse_s)
0 1.0 1 NaN 2 NaN dtype: Sparse[float64, nan]