在现实场景中，数据缺失始终是一个问题。由于缺失值会导致数据质量下降，机器学习和数据挖掘等领域在模型预测的准确性方面面临严重问题。在这些领域中，缺失值处理是使模型更加准确、有效的关键环节。
import pandas as pd
import numpy as np
# Build a 5x3 frame of standard-normal draws, then reindex it onto eight
# labels; the labels with no source row ('b', 'd', 'g') become all-NaN rows.
observed_labels = ['a', 'c', 'e', 'f', 'h']
all_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
samples = np.random.randn(5, 3)
df = pd.DataFrame(samples, index=observed_labels,
                  columns=['one', 'two', 'three'])
df = df.reindex(all_labels)
df  # notebook-style display of the resulting frame
| | one | two | three |
|---|---|---|---|
| a | 0.738658 | 1.210542 | 0.321815 |
| b | NaN | NaN | NaN |
| c | -0.341467 | -1.005193 | -0.086754 |
| d | NaN | NaN | NaN |
| e | -1.695711 | -1.226859 | 0.709378 |
| f | -1.154536 | -0.453908 | -0.520920 |
| g | NaN | NaN | NaN |
| h | -1.469753 | 1.686908 | -1.798809 |
示例1
import pandas as pd
import numpy as np
# Report, label by label, whether column 'one' is missing after reindexing.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)
df = df.reindex(list('abcdefgh'))
missing_in_one = df['one'].isnull()
print(missing_in_one)
a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool
示例2
import pandas as pd
import numpy as np
# Complement of the null check: True wherever column 'one' holds a real value.
row_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
).reindex(row_labels)
print(df['one'].notnull())
a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool
示例1
import pandas as pd
import numpy as np
# Sum column 'one' of the reindexed frame. Series.sum() skips NaN by default,
# so the three all-NaN rows do not turn the total into NaN.
column_names = ['one', 'two', 'three']
df = pd.DataFrame(np.random.randn(5, 3),
                  index=['a', 'c', 'e', 'f', 'h'],
                  columns=column_names)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
total_one = df['one'].sum()
print(total_one)
3.1508019285243445
示例2
import pandas as pd
import numpy as np
# A frame created from only an index and column names holds no data, so every
# cell is NaN; summing such a column prints 0.
empty_index = [0, 1, 2, 3, 4, 5]
df = pd.DataFrame(index=empty_index, columns=['one', 'two'])
column_total = df['one'].sum()
print(column_total)
0
import pandas as pd
import numpy as np
# Three observed rows ('a', 'c', 'e') reindexed to 'a', 'b', 'c': row 'e' is
# dropped and the new label 'b' appears as an all-NaN row.
source_rows = ['a', 'c', 'e']
target_rows = ['a', 'b', 'c']
df = pd.DataFrame(np.random.randn(3, 3), index=source_rows,
                  columns=['one', 'two', 'three'])
df = df.reindex(target_rows)
print(df)
        one       two     three
a -0.236832 -0.763248  0.956353
b       NaN       NaN       NaN
c -1.799230 -1.782872 -0.681165
# Banner line shown before the zero-filled frame.
print("NaN replaced with '0':")
NaN replaced with '0':
# fillna(0) returns a copy with every NaN replaced by 0; df itself is untouched.
zero_filled = df.fillna(0)
print(zero_filled)
        one       two     three
a -0.236832 -0.763248  0.956353
b  0.000000  0.000000  0.000000
c -1.799230 -1.782872 -0.681165
方法 | 动作 |
---|---|
pad/ffill | 向前填充 |
bfill/backfill | 向后填充 |
示例1
import pandas as pd
import numpy as np
# Forward-fill demo: each all-NaN row introduced by reindex() takes the values
# of the nearest row above it.
df = pd.DataFrame(np.random.randn(5, 3),
                  index=['a', 'c', 'e', 'f', 'h'],
                  columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
# DataFrame.fillna(method='pad') is deprecated and emits a FutureWarning on
# recent pandas releases; DataFrame.ffill() is the supported equivalent.
filled = df.ffill()
print(filled)
        one       two     three
a -0.569026 -0.611094  0.865132
b -0.569026 -0.611094  0.865132
c  0.080905 -1.132373  1.457304
d  0.080905 -1.132373  1.457304
e  2.237134 -0.952788  1.087192
f  0.871868  0.388134  0.238155
g  0.871868  0.388134  0.238155
h  0.133421  1.154708  1.322455
/tmp/ipykernel_1034/2314951150.py:1: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. print (df.fillna(method='pad'))
示例2
import pandas as pd
import numpy as np
# Backward-fill demo: each all-NaN row introduced by reindex() takes the values
# of the nearest row below it.
df = pd.DataFrame(np.random.randn(5, 3),
                  index=['a', 'c', 'e', 'f', 'h'],
                  columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
# DataFrame.fillna(method='backfill') is deprecated and emits a FutureWarning
# on recent pandas releases; DataFrame.bfill() is the supported equivalent.
filled = df.bfill()
print(filled)
        one       two     three
a  1.865700  0.270111 -0.348879
b -1.232328  0.887380 -0.081143
c -1.232328  0.887380 -0.081143
d  1.078365  0.223137  0.983797
e  1.078365  0.223137  0.983797
f -0.350317 -0.631913 -0.925956
g  0.237909  1.267474 -1.641670
h  0.237909  1.267474 -1.641670
/tmp/ipykernel_1034/1644860831.py:1: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead. print (df.fillna(method='backfill'))
示例1
import pandas as pd
import numpy as np
# dropna() with its default axis discards every row that contains a NaN, which
# leaves only the five labels that had data before reindexing.
kept_labels = ['a', 'c', 'e', 'f', 'h']
df = pd.DataFrame(np.random.randn(5, 3), index=kept_labels,
                  columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
complete_rows = df.dropna()
print(complete_rows)
        one       two     three
a  1.882943 -2.196758 -1.185387
c  0.364389 -0.857485  0.076410
e -1.214498 -1.067700  1.631134
f -0.433561  0.339309 -0.834340
h  0.770274 -0.730790 -2.266530
示例2
import pandas as pd
import numpy as np
# dropna(axis=1) discards every column that contains a NaN. After reindexing,
# all three columns have NaN entries, so the result has no columns left.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
full_index = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
df = df.reindex(full_index)
print(df.dropna(axis=1))
Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]
示例1
import pandas as pd
import numpy as np
# Swap the two out-of-range values for generic replacements: 1000 becomes 10
# and 2000 becomes 60, wherever they occur in the frame.
column_one = [10, 20, 30, 40, 50, 2000]
column_two = [1000, 0, 30, 40, 50, 60]
df = pd.DataFrame({'one': column_one, 'two': column_two})
substitutions = {1000: 10, 2000: 60}
print(df.replace(substitutions))
   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60
示例2
import pandas as pd
import numpy as np
# replace() with a dict maps each key to its value across the whole frame and
# returns a new frame; df keeps its original contents.
df = pd.DataFrame(
    {
        'one': [10, 20, 30, 40, 50, 2000],
        'two': [1000, 0, 30, 40, 50, 60],
    }
)
cleaned = df.replace({1000: 10, 2000: 60})
print(cleaned)
   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60