In [40]: df
Out[40]:
   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0
In [41]: pd.DataFrame(d, index=["d", "b", "a"])
Out[41]:
   one  two
d  NaN  4.0
b  2.0  2.0
a  1.0  1.0
In [42]: pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"]) Out[42]: two three d 4.0 NaN b 2.0 NaN a 1.0 NaN In [43]: df.index Out[43]: Index(['a', 'b', 'c', 'd'], dtype='object')
In [44]: df.columns Out[44]: Index(['one', 'two'], dtype='object')
Note
当一组特定的列与数据的dict一起被传递时,传递的列会覆盖dict中的键。
2. 从Dict of ndarrays/list 中创建
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
In [45]: d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}
In [46]: pd.DataFrame(d)
Out[46]:
   one  two
0  1.0  4.0
1  2.0  3.0
2  3.0  2.0
3  4.0  1.0
In [47]: pd.DataFrame(d, index=["a", "b", "c", "d"])
Out[47]:
   one  two
a  1.0  4.0
b  2.0  3.0
c  3.0  2.0
d  4.0  1.0
In [54]: pd.DataFrame(data2)
Out[54]:
   a   b     c
0  1   2   NaN
1  5  10  20.0
In [55]: pd.DataFrame(data2, index=["first", "second"])
Out[55]:
        a   b     c
first   1   2   NaN
second  5  10  20.0
In [56]: pd.DataFrame(data2, columns=["a", "b"])
Out[56]:
   a   b
0  1   2
1  5  10
5. 从 dict of tuples 中创建
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
In [57]: pd.DataFrame(
   ....:     {
   ....:         ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
   ....:         ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
   ....:         ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
   ....:         ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
   ....:         ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
   ....:     }
   ....: )
   ....:
Out[57]:
       a              b
       b    a    c    a     b
A B  1.0  4.0  5.0  8.0  10.0
  C  2.0  3.0  6.0  7.0   NaN
  D  NaN  NaN  NaN  NaN   9.0
6. 从 Series 直接创建
1 2 3 4 5 6 7 8
In [58]: ser = pd.Series(range(3), index=list("abc"), name="ser")
In [59]: pd.DataFrame(ser) Out[59]: ser a 0 b 1 c 2
7. 从 list of namedtuples 中创建
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
In [60]: from collections import namedtuple
In [61]: Point = namedtuple("Point", "x y")
In [62]: pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)])
Out[62]:
   x  y
0  0  0
1  0  3
2  2  3
In [63]: Point3D = namedtuple("Point3D", "x y z")
In [64]: pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)])
Out[64]:
   x  y    z
0  0  0  0.0
1  0  3  5.0
2  2  3  NaN
8. 从 list of dataclasses 创建
1 2 3 4 5 6 7 8 9 10
In [65]: from dataclasses import make_dataclass
In [66]: Point = make_dataclass("Point", [("x", int), ("y", int)])
In [67]: pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
Out[67]:
   x  y
0  0  0
1  0  3
2  2  3
In [98]: df * 5 + 2
Out[98]:
           A         B         C          D
0   3.359299 -0.124862  4.835102   3.381160
1  -3.437003 -1.368449  2.568242  -5.392133
2   4.624938  4.023526  4.885230  -6.575010
3  -3.196342  0.146766 -3.789461  -4.721559
4   6.224426  7.378849  1.454750  10.217815
5  -5.346940  3.785103 -1.373001  -6.884519
6  -2.844569 -4.472618  4.068691   3.383309
7  -0.360173  1.930201  0.187285   1.969232
8  -2.615303  6.478587  6.026220  -4.032059
9  14.828230  9.156280  8.701544  -3.851494
In [99]: 1 / df
Out[99]:
          A          B         C           D
0  3.678365  -2.353094  1.763605    3.620145
1 -0.919624  -1.484363  8.799067   -0.676395
2  1.904807   2.470934  1.732964   -0.583090
3 -0.962215  -2.697986 -0.863638   -0.743875
4  1.183593   0.929567 -9.170108    0.608434
5 -0.680555   2.800959 -1.482360   -0.562777
6 -1.032084  -0.772485  2.416988    3.614523
7 -2.118489 -71.634509 -2.758294 -162.507295
8 -1.083352   1.116424  1.241860   -0.828904
9  0.389765   0.698687  0.746097   -0.854483
In [100]: df ** 4
Out[100]:
           A             B         C             D
0   0.005462  3.261689e-02  0.103370  5.822320e-03
1   1.398165  2.059869e-01  0.000167  4.777482e+00
2   0.075962  2.682596e-02  0.110877  8.650845e+00
3   1.166571  1.887302e-02  1.797515  3.265879e+00
4   0.509555  1.339298e+00  0.000141  7.297019e+00
5   4.661717  1.624699e-02  0.207103  9.969092e+00
6   0.881334  2.808277e+00  0.029302  5.858632e-03
7   0.049647  3.797614e-08  0.017276  1.433866e-09
8   0.725974  6.437005e-01  0.420446  2.118275e+00
9  43.329821  4.196326e+00  3.227153  1.875802e+00
In [108]: np.exp(df)
Out[108]:
           A         B         C         D
0   1.312403  0.653788  1.763006  1.318154
1   0.337092  0.509824  1.120358  0.227996
2   1.690438  1.498861  1.780770  0.179963
3   0.353713  0.690288  0.314148  0.260719
4   2.327710  2.932249  0.896686  5.173571
5   0.230066  1.429065  0.509360  0.169161
6   0.379495  0.274028  1.512461  1.318720
7   0.623732  0.986137  0.695904  0.993865
8   0.397301  2.449092  2.237242  0.299269
9  13.009059  4.183951  3.820223  0.310274
# 转化为string
In [126]: print(baseball.iloc[-20:, :12].to_string())
       id     player  year  stint team  lg    g   ab   r    h  X2b  X3b
80  89474  finlest01  2007      1  COL  NL   43   94   9   17    3    0
81  89480  embreal01  2007      1  OAK  AL    4    0   0    0    0    0
82  89481  edmonji01  2007      1  SLN  NL  117  365  39   92   15    2
83  89482  easleda01  2007      1  NYN  NL   76  193  24   54    6    0
84  89489  delgaca01  2007      1  NYN  NL  139  538  71  139   30    0
85  89493  cormirh01  2007      1  CIN  NL    6    0   0    0    0    0
86  89494  coninje01  2007      2  NYN  NL   21   41   2    8    2    0
87  89495  coninje01  2007      1  CIN  NL   80  215  23   57   11    1
88  89497  clemero02  2007      1  NYA  AL    2    2   0    1    0    0
89  89498  claytro01  2007      2  BOS  AL    8    6   1    0    0    0
90  89499  claytro01  2007      1  TOR  AL   69  189  23   48   14    0
91  89501  cirilje01  2007      2  ARI  NL   28   40   6    8    4    0
92  89502  cirilje01  2007      1  MIN  AL   50  153  18   40    9    2
93  89521  bondsba01  2007      1  SFN  NL  126  340  75   94   14    0
94  89523  biggicr01  2007      1  HOU  NL  141  517  68  130   31    3
95  89525  benitar01  2007      2  FLO  NL   34    0   0    0    0    0
96  89526  benitar01  2007      1  SFN  NL   19    0   0    0    0    0
97  89530  ausmubr01  2007      1  HOU  NL  117  349  38   82   16    3
98  89533   aloumo01  2007      1  NYN  NL   87  328  51  112   19    1
99  89534  alomasa02  2007      1  NYN  NL    8   22   1    3    1    0
# In [127]: pd.set_option("display.max_colwidth", 100) # optional In [128]: pd.set_option('display.max_rows', None) In [129]: pd.set_option('display.max_columns', None)
In [130]: print(baseball.iloc[-20:, :12])
       id     player  year  stint team  lg    g   ab   r    h  X2b  X3b
80  89474  finlest01  2007      1  COL  NL   43   94   9   17    3    0
81  89480  embreal01  2007      1  OAK  AL    4    0   0    0    0    0
82  89481  edmonji01  2007      1  SLN  NL  117  365  39   92   15    2
83  89482  easleda01  2007      1  NYN  NL   76  193  24   54    6    0
84  89489  delgaca01  2007      1  NYN  NL  139  538  71  139   30    0
85  89493  cormirh01  2007      1  CIN  NL    6    0   0    0    0    0
86  89494  coninje01  2007      2  NYN  NL   21   41   2    8    2    0
87  89495  coninje01  2007      1  CIN  NL   80  215  23   57   11    1
88  89497  clemero02  2007      1  NYA  AL    2    2   0    1    0    0
89  89498  claytro01  2007      2  BOS  AL    8    6   1    0    0    0
90  89499  claytro01  2007      1  TOR  AL   69  189  23   48   14    0
91  89501  cirilje01  2007      2  ARI  NL   28   40   6    8    4    0
92  89502  cirilje01  2007      1  MIN  AL   50  153  18   40    9    2
93  89521  bondsba01  2007      1  SFN  NL  126  340  75   94   14    0
94  89523  biggicr01  2007      1  HOU  NL  141  517  68  130   31    3
95  89525  benitar01  2007      2  FLO  NL   34    0   0    0    0    0
96  89526  benitar01  2007      1  SFN  NL   19    0   0    0    0    0
97  89530  ausmubr01  2007      1  HOU  NL  117  349  38   82   16    3
98  89533   aloumo01  2007      1  NYN  NL   87  328  51  112   19    1
99  89534  alomasa02  2007      1  NYN  NL    8   22   1    3    1    0
>>> classes = ['a', 'b', 'c', 'd', 'e']
# 初始值为 0, 这样的话就比较好直接进行 matrix 相加.
# 如果Matrix 中有 NaN,直接相加会得到 NaN.
>>> cfm = pd.DataFrame(0, index=classes, columns=classes)
>>> cfm
   a  b  c  d  e
a  0  0  0  0  0
b  0  0  0  0  0
c  0  0  0  0  0
d  0  0  0  0  0
e  0  0  0  0  0
# 这里使用随机值代替模型评估统计的结果.
>>> cfm = cfm.applymap(lambda cell: np.random.randint(100))
>>> cfm
    a   b   c   d   e
a  50  35  87  42   5
b  36  31  99  89   0
c  41  77  64  58  49
d  45  45  81  64  40
e  37  13  85  16  36
# 我们可以直接把数字换成百分比. 比如坐标的 index 代表的是 groundtruth.
# column 代表的是 prediction. 这个 confusion matrix 就可以用来检测分类性能.
>>> percentage_cfm = cfm.div(cfm.sum(axis=1), axis=0)
>>>