In [1]:
import pandas as pd
In [3]:
my_dict = {'X': [1,2,3,4], 'Y': [5,6,7,8], 'Z': [4,5,6,7]}
print(my_dict)
{'X': [1, 2, 3, 4], 'Y': [5, 6, 7, 8], 'Z': [4, 5, 6, 7]}
In [6]:
df = pd.DataFrame(my_dict)
df
Out[6]:
X | Y | Z | |
---|---|---|---|
0 | 1 | 5 | 4 |
1 | 2 | 6 | 5 |
2 | 3 | 7 | 6 |
3 | 4 | 8 | 7 |
In [8]:
df2 = pd.DataFrame({'X': [1,2,3,4], 'Y': [5,6,7,8], 'Z': [4,5,6,7]})
In [9]:
df == df2
Out[9]:
X | Y | Z | |
---|---|---|---|
0 | True | True | True |
1 | True | True | True |
2 | True | True | True |
3 | True | True | True |
In [10]:
df['X']
Out[10]:
X | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
3 | 4 |
Axis:
- 0: rows (default)
- 1: columns
In [11]:
type(df['X'])
Out[11]:
pandas.core.series.Series
def __init__(data=None, index=None, dtype: Dtype | None=None, name=None, copy: bool | None=None, fastpath: bool=False) -> None
One-dimensional ndarray with axis labels (including time series). Labels need not be unique but must be a hashable type. The object supports both integer- and label-based indexing and provides a host of methods for performing operations involving the index. Statistical methods from ndarray have been overridden to automatically exclude missing data (currently represented as NaN). Operations between Series (+, -, /, \*, \*\*) align values based on their associated index values-- they need not be the same length. The result index will be the sorted union of the two indexes. Parameters ---------- data : array-like, Iterable, dict, or scalar value Contains data stored in Series. If data is a dict, argument order is maintained. index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like and index is None, then the keys in the data are used as the index. If the index is not None, the resulting Series is reindexed with the index values. dtype : str, numpy.dtype, or ExtensionDtype, optional Data type for the output Series. If not specified, this will be inferred from `data`. See the :ref:`user guide <basics.dtypes>` for more usages. name : Hashable, default None The name to give to the Series. copy : bool, default False Copy input data. Only affects Series or 1d ndarray input. See examples. Notes ----- Please reference the :ref:`User Guide <basics.series>` for more information. Examples -------- Constructing Series from a dictionary with an Index specified >>> d = {'a': 1, 'b': 2, 'c': 3} >>> ser = pd.Series(data=d, index=['a', 'b', 'c']) >>> ser a 1 b 2 c 3 dtype: int64 The keys of the dictionary match with the Index values, hence the Index values have no effect. 
>>> d = {'a': 1, 'b': 2, 'c': 3} >>> ser = pd.Series(data=d, index=['x', 'y', 'z']) >>> ser x NaN y NaN z NaN dtype: float64 Note that the Index is first build with the keys from the dictionary. After this the Series is reindexed with the given Index values, hence we get all NaN as a result. Constructing Series from a list with `copy=False`. >>> r = [1, 2] >>> ser = pd.Series(r, copy=False) >>> ser.iloc[0] = 999 >>> r [1, 2] >>> ser 0 999 1 2 dtype: int64 Due to input data type the Series has a `copy` of the original data even though `copy=False`, so the data is unchanged. Constructing Series from a 1d ndarray with `copy=False`. >>> r = np.array([1, 2]) >>> ser = pd.Series(r, copy=False) >>> ser.iloc[0] = 999 >>> r array([999, 2]) >>> ser 0 999 1 2 dtype: int64 Due to input data type the Series has a `view` on the original data, so the data is changed as well.
In [12]:
type(df)
Out[12]:
pandas.core.frame.DataFrame
def __init__(data=None, index: Axes | None=None, columns: Axes | None=None, dtype: Dtype | None=None, copy: bool | None=None) -> None
Two-dimensional, size-mutable, potentially heterogeneous tabular data. Data structure also contains labeled axes (rows and columns). Arithmetic operations align on both row and column labels. Can be thought of as a dict-like container for Series objects. The primary pandas data structure. Parameters ---------- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame Dict can contain Series, arrays, constants, dataclass or list-like objects. If data is a dict, column order follows insertion-order. If a dict contains Series which have an index defined, it is aligned by its index. This alignment also occurs if data is a Series or a DataFrame itself. Alignment is done on Series/DataFrame inputs. If data is a list of dicts, column order follows insertion-order. index : Index or array-like Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided. columns : Index or array-like Column labels to use for resulting frame when data does not have them, defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels, will perform column selection instead. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. copy : bool or None, default None Copy data from inputs. For dict data, the default of None behaves like ``copy=True``. For DataFrame or 2d ndarray input, the default of None behaves like ``copy=False``. If data is a dict containing one or more Series (possibly of different dtypes), ``copy=False`` will ensure that these inputs are not copied. .. versionchanged:: 1.3.0 See Also -------- DataFrame.from_records : Constructor from tuples, also record arrays. DataFrame.from_dict : From dicts of Series, arrays, or dicts. read_csv : Read a comma-separated values (csv) file into DataFrame. read_table : Read general delimited file into DataFrame. read_clipboard : Read text from clipboard into DataFrame. 
Notes ----- Please reference the :ref:`User Guide <basics.dataframe>` for more information. Examples -------- Constructing DataFrame from a dictionary. >>> d = {'col1': [1, 2], 'col2': [3, 4]} >>> df = pd.DataFrame(data=d) >>> df col1 col2 0 1 3 1 2 4 Notice that the inferred dtype is int64. >>> df.dtypes col1 int64 col2 int64 dtype: object To enforce a single dtype: >>> df = pd.DataFrame(data=d, dtype=np.int8) >>> df.dtypes col1 int8 col2 int8 dtype: object Constructing DataFrame from a dictionary including Series: >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])} >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) col1 col2 0 0 NaN 1 1 NaN 2 2 2.0 3 3 3.0 Constructing DataFrame from numpy ndarray: >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), ... columns=['a', 'b', 'c']) >>> df2 a b c 0 1 2 3 1 4 5 6 2 7 8 9 Constructing DataFrame from a numpy ndarray that has labeled columns: >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]) >>> df3 = pd.DataFrame(data, columns=['c', 'a']) ... >>> df3 c a 0 3 1 1 6 4 2 9 7 Constructing DataFrame from dataclass: >>> from dataclasses import make_dataclass >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) x y 0 0 0 1 0 3 2 2 3 Constructing DataFrame from Series/DataFrame: >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"]) >>> df = pd.DataFrame(data=ser, index=["a", "c"]) >>> df 0 a 1 c 3 >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"]) >>> df2 = pd.DataFrame(data=df1, index=["a", "c"]) >>> df2 x a 1 c 3
In [15]:
df['X'][1]
Out[15]:
2
In [14]:
df.X
Out[14]:
X | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
3 | 4 |
In [18]:
df.index
Out[18]:
RangeIndex(start=0, stop=4, step=1)
In [19]:
df.columns
Out[19]:
Index(['X', 'Y', 'Z'], dtype='object')
In [24]:
import numpy as np
# np.r_[1:9:2] expands the slice 1:9:2 into the array [1, 3, 5, 7];
# assigning it to a new label adds column 'W' to df.
df['W'] = np.r_[1:9:2]
df
Out[24]:
X | Y | Z | W | |
---|---|---|---|---|
0 | 1 | 5 | 4 | 1 |
1 | 2 | 6 | 5 | 3 |
2 | 3 | 7 | 6 | 5 |
3 | 4 | 8 | 7 | 7 |
In [34]:
# Three ways of dropping a column.
df = pd.DataFrame(my_dict)
df['W'] = np.r_[1:9:2]
df2 = df.drop('W', axis=1) # returns a new DataFrame (a copy, not an alias); df keeps 'W'
df2  # not the last expression of the cell, so it is not displayed
df.drop('W', axis=1, inplace=True) # inplace=True mutates df itself
df.drop(columns=['X']) # columns=[...] is the keyword form of axis=1; only this result is displayed
Out[34]:
Y | Z | |
---|---|---|
0 | 5 | 4 |
1 | 6 | 5 |
2 | 7 | 6 |
3 | 8 | 7 |
In [35]:
df = pd.DataFrame(my_dict)
df['W'] = np.r_[1:9:2]
df2 = df.drop('W') # raises KeyError: drop() defaults to axis=0 (rows) and 'W' is not a row label
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-35-9513f2017459> in <cell line: 3>() 1 df = pd.DataFrame(my_dict) 2 df['W'] = np.r_[1:9:2] ----> 3 df2 = df.drop('W') /usr/local/lib/python3.10/dist-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors) 5342 weight 1.0 0.8 5343 """ -> 5344 return super().drop( 5345 labels=labels, 5346 axis=axis, /usr/local/lib/python3.10/dist-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors) 4709 for axis, labels in axes.items(): 4710 if labels is not None: -> 4711 obj = obj._drop_axis(labels, axis, level=level, errors=errors) 4712 4713 if inplace: /usr/local/lib/python3.10/dist-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors, only_slice) 4751 new_axis = axis.drop(labels, level=level, errors=errors) 4752 else: -> 4753 new_axis = axis.drop(labels, errors=errors) 4754 indexer = axis.get_indexer(new_axis) 4755 /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in drop(self, labels, errors) 6998 if mask.any(): 6999 if errors != "ignore": -> 7000 raise KeyError(f"{labels[mask].tolist()} not found in axis") 7001 indexer = indexer[~mask] 7002 return self.delete(indexer) KeyError: "['W'] not found in axis"
In [40]:
# iloc: integer-position-based selection (matrix-like): [rows, columns]
df = pd.DataFrame(my_dict)
df['W'] = np.r_[1:9:2]
df.iloc[:2,3:4] # the end of each slice is excluded (standard Python convention): rows 0-1, column 3
Out[40]:
W | |
---|---|
0 | 1 |
1 | 3 |
In [46]:
# loc: label-based selection
df = pd.DataFrame(my_dict)
df['W'] = np.r_[1:9:2]
df.loc[1:2, 'X'] # unlike iloc, a label slice includes its end point: rows 1 AND 2 (Matlab-like)
Out[46]:
X | |
---|---|
1 | 2 |
2 | 3 |
In [47]:
df2 = df > 3
In [49]:
df[df.X > 3]
Out[49]:
X | Y | Z | W | |
---|---|---|---|---|
3 | 4 | 8 | 7 | 7 |
`|` means "or"
`&` means "and"
In [51]:
df[(df.X > 3) | (df.Y < 6)]
Out[51]:
X | Y | Z | W | |
---|---|---|---|---|
0 | 1 | 5 | 4 | 1 |
3 | 4 | 8 | 7 | 7 |
In [52]:
# Without parentheses this raises ValueError: | binds tighter than > and <,
# so the expression is parsed as df.X > (3 | df.Y) < 6.
df[df.X > 3 | df.Y < 6]
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-52-26767fb6b8c1> in <cell line: 1>() ----> 1 df[df.X > 3 | df.Y < 6] /usr/local/lib/python3.10/dist-packages/pandas/core/generic.py in __nonzero__(self) 1517 @final 1518 def __nonzero__(self) -> NoReturn: -> 1519 raise ValueError( 1520 f"The truth value of a {type(self).__name__} is ambiguous. " 1521 "Use a.empty, a.bool(), a.item(), a.any() or a.all()." ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
In [60]:
# Arithmetic on columns is element-wise.
df.X - np.r_[7:3:-1] # Series minus ndarray; result is discarded (not the cell's last expression)
df.X/df.Y # Series divided by Series; result is discarded as well
df['A'] = df.X**2 # store the squares of X as a new column 'A'
df
Out[60]:
X | Y | Z | W | A | |
---|---|---|---|---|---|
0 | 1 | 5 | 4 | 1 | 1 |
1 | 2 | 6 | 5 | 3 | 4 |
2 | 3 | 7 | 6 | 5 | 9 |
3 | 4 | 8 | 7 | 7 | 16 |
In [64]:
df.mean(axis=0)
Out[64]:
0 | |
---|---|
X | 2.5 |
Y | 6.5 |
Z | 5.5 |
W | 4.0 |
A | 7.5 |
In [66]:
type(df.A.mean())
Out[66]:
numpy.float64
In [67]:
df.corr()
Out[67]:
X | Y | Z | W | A | |
---|---|---|---|---|---|
X | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.984374 |
Y | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.984374 |
Z | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.984374 |
W | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.984374 |
A | 0.984374 | 0.984374 | 0.984374 | 0.984374 | 1.000000 |
groupby
In [69]:
# Sample frame for the groupby examples: keys A, B, C repeated three times,
# plus two numeric columns.
df = pd.DataFrame(
    {
        "key": ["A", "B", "C"] * 3,
        "data": [0, 5, 10, 5, 10, 15, 10, 15, 20],
        "data2": range(9, 0, -1),
    }
)
df
Out[69]:
key | data | data2 | |
---|---|---|---|
0 | A | 0 | 9 |
1 | B | 5 | 8 |
2 | C | 10 | 7 |
3 | A | 5 | 6 |
4 | B | 10 | 5 |
5 | C | 15 | 4 |
6 | A | 10 | 3 |
7 | B | 15 | 2 |
8 | C | 20 | 1 |
Mean of `data` for each `key`
In [71]:
df.groupby('key').mean()
Out[71]:
data | data2 | |
---|---|---|
key | ||
A | 5.0 | 6.0 |
B | 10.0 | 5.0 |
C | 15.0 | 4.0 |
In [72]:
df.groupby('key')['data'].mean()
Out[72]:
data | |
---|---|
key | |
A | 5.0 |
B | 10.0 |
C | 15.0 |
In [74]:
df.groupby('key').mean()['data'] # average every column per key, then select 'data'; result is not stored
df2 = df.groupby('key').mean() # the same computation split into two steps
df2['data']
Out[74]:
data | |
---|---|
key | |
A | 5.0 |
B | 10.0 |
C | 15.0 |
In [76]:
df.groupby('key').first()
Out[76]:
data | data2 | |
---|---|---|
key | ||
A | 0 | 9 |
B | 5 | 8 |
C | 10 | 7 |
aggregate
In [85]:
df.iloc[:,1:].aggregate(['mean','min'])
Out[85]:
data | data2 | |
---|---|---|
mean | 10.0 | 5.0 |
min | 0.0 | 1.0 |
In [87]:
df.aggregate({'data': 'mean', 'data2': 'min'})
Out[87]:
0 | |
---|---|
data | 10.0 |
data2 | 1.0 |
In [89]:
def my_func(x):
    """Return the spread of *x*: its maximum minus its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods (for example a
    pandas Series or a NumPy array).
    """
    return x.max() - x.min()
df.groupby('key').aggregate(my_func) # apply my_func to each key group; result is not displayed
df.aggregate({'data': my_func}) # apply my_func to the whole 'data' column: 20 - 0 = 20
Out[89]:
0 | |
---|---|
data | 20 |
In [91]:
df['data'].plot()
Out[91]:
<Axes: >
In [95]:
a = np.array([1,np.nan,3])
a.sum() # a NaN element makes the NumPy sum NaN
Out[95]:
nan
In [100]:
# Same sample frame as before, but with one missing value (NaN) in 'data'.
df = pd.DataFrame(
    {
        "key": ["A", "B", "C"] * 3,
        "data": [0, np.nan, 10, 5, 10, 15, 10, 15, 20],
        "data2": range(9, 0, -1),
    }
)
df
Out[100]:
key | data | data2 | |
---|---|---|---|
0 | A | 0.0 | 9 |
1 | B | NaN | 8 |
2 | C | 10.0 | 7 |
3 | A | 5.0 | 6 |
4 | B | 10.0 | 5 |
5 | C | 15.0 | 4 |
6 | A | 10.0 | 3 |
7 | B | 15.0 | 2 |
8 | C | 20.0 | 1 |
In [101]:
df.groupby('key').mean() # NaN is skipped: B's mean of 'data' is (10 + 15) / 2 = 12.5
Out[101]:
data | data2 | |
---|---|---|
key | ||
A | 5.0 | 6.0 |
B | 12.5 | 5.0 |
C | 15.0 | 4.0 |