# encoding=utf-8
import numpy as np
import pandas as pd


def main():
    """Print a small tour of the two basic pandas data structures.

    Builds a Series of the first ten even numbers and prints its type, then
    builds an 8x5 DataFrame of normally distributed random values indexed by
    eight consecutive dates starting at 2017-03-01 (columns A-E) and prints it.
    """
    # Data structure: a one-dimensional labelled array.
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))

    # Data structure: a two-dimensional table with a date index.
    dates = pd.date_range('20170301', periods=8)
    df = pd.DataFrame(
        np.random.randn(8, 5),
        index=dates,
        columns=list('ABCDE'),
    )
    print(df)


if __name__ == '__main__':
    main()
import numpy as np
import pandas as pd
Series是pandas中基础的数据结构
date_range('20200202', periods=8) 从指定日期起依次生成连续的 8 天,这些日期可作为 DataFrame 的主键(行标签)
DataFrame(数据, index=主键(行标签), columns=列标签)
类似excel
pandas总体框架
def main():
    """Print examples of the basic pandas data structures.

    Demonstrates, in order:
      1. a Series built from a list comprehension (its type is printed);
      2. a DataFrame built from an 8x5 array of ``np.random.rand`` values,
         indexed by eight consecutive dates starting at 2017-03-01;
      3. a DataFrame built from a dict of columns, mixing a scalar, a
         Timestamp, a float32 Series, a float32 array and a Categorical.
    """
    # Data Structure
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))

    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(
        np.random.rand(8, 5),
        index=dates,
        columns=list("ABCDE"),
    )
    print(df)

    # Scalars ("A", "B") are broadcast to the length of the other columns.
    df = pd.DataFrame({
        "A": 1,
        "B": pd.Timestamp("20170301"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="float32"),
        "E": pd.Categorical(["police", "student", "teacher", "doctor"]),
    })
    print(df)
# Series of the first ten even numbers (2, 4, ..., 20).
s = pd.Series([i * 2 for i in range(1, 11)])
print(type(s))

# Eight consecutive days starting at 2017-03-01, used as the row labels.
dates = pd.date_range("20170301", periods=8)

# 8x5 table of normally distributed random values with columns a-e.
df = pd.DataFrame(
    np.random.randn(8, 5),
    index=dates,
    columns=list("abcde"),
)
数据结构1
# Build a DataFrame from a dict of columns. Scalar values ("A", "B") are
# broadcast to the length of the sequence-valued columns (4 rows here).
df = pd.DataFrame({
    "A": 1,
    "B": pd.Timestamp("20170301"),
    # The Series call must be closed before the next dict key.
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3] * 4, dtype="float32"),
    "E": pd.Categorical(["police", "student", "teacher", "doctor"]),
})
# Define a series and a date range.
s = pd.Series([i * 2 for i in range(1, 11)])
dates = pd.date_range("20170301", periods=8)  # 8 days starting at 2017-03-01

# First way to define a table: 8 rows x 5 columns, the index (row names) is
# `dates` and the column names are A-E.
df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))

# Second way to define a table: from a dict of columns. It is bound to its own
# name so that the date-indexed `df` above is still the table used below.
df_mixed = pd.DataFrame({
    "A": 1,
    "B": pd.Timestamp("20170301"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3] * 4, dtype="float32"),
    "E": pd.Categorical(["police", "student", "teacher", "doctor"]),
})

print(df.head(3))  # first three rows
print(df.tail(3))  # last three rows
print(df.index)
print(df.values)  # the result is an array
print(df.T)  # swap index and columns
print(df.sort_values(by="C"))  # DataFrame.sort() no longer exists in pandas
print(df.sort_index(axis=1, ascending=False))  # sort column labels (axis=1) descending
print(df.describe())  # count, mean, std, min, max, 25%, 50%, 75%

# Slicing and selection
print(df["A"])  # column A
print(df[:3])  # first three rows
print(df["20170301":"20170304"])
print(df.loc[dates[0]])
print(df.loc["20170301":"20170304", ["B", "D"]])
print(df.at[dates[0], "C"])
print(df.iloc[1:3, 2:4])
print(df.iloc[1, 4])
print(df.iat[1, 4])
# One combined mask instead of chained boolean indexing, which would apply the
# second mask to an already-filtered frame.
print(df[(df.B > 0) & (df.A < 0)])
print(df[df > 0])  # entries that are not > 0 become NaN
print(df[df["E"].isin([1, 2])])

# Setting values
s1 = pd.Series(list(range(10, 18)), index=pd.date_range("20170301", periods=8))
df["F"] = s1
df.at[dates[0], "A"] = 0
df.loc[:, "D"] = np.array([4] * len(df))

# Copying
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)

# Handling missing values
df1 = df.reindex(index=dates[:4], columns=list("ABCD") + ["G"])
df1.loc[dates[0]:dates[1], "G"] = 1
print(df1.dropna())  # drop the rows that contain missing values
print(df1.fillna(value=2))  # fill missing values with 2

# Summary statistics
print(df.mean())  # mean of every column
print(df.var())
s = pd.Series([1, 2, 3, np.nan, 5, 7, 9, 10], index=dates)
print(s.shift(2))  # shift values down by two; first two become NaN, overflow is dropped
print(s.diff())  # no argument means first-order difference; a number sets the period
print(s.value_counts())  # number of occurrences of each value
print(df.apply(np.cumsum))
print(df.apply(lambda x: x.max() - x.min()))  # per-column range

# Combining tables
pieces = [df[:3], df[-3:]]
print(pd.concat(pieces))  # stack the first and last three rows
left = pd.DataFrame({"key": ["x", "y"], "value": [1, 2]})
right = pd.DataFrame({"key": ["x", "z"], "value": [3, 4]})
# how="inner" keeps only keys found in both tables (no missing values);
# how="outer" keeps every key and leaves the missing values in place.
print(pd.merge(left, right, on="key", how="left"))
df3 = pd.DataFrame({"A": ["a", "b", "c", "b"], "B": list(range(4))})
# Sum column B within each distinct value of column A.
print(df3.groupby("A").sum())