aPandaStepAhead - TeamFlowerPower/kb GitHub Wiki
import pandas
# import numpy # for (4), but this is inefficient anywaytableColumns = ["audioCodec", "displayHeight", "testName"]
testFrame = pandas.DataFrame(columns=tableColumns)print empty frame (only column names)
print(testFrame)testFrame.loc[len(testFrame.index)] = [1,2,3]i = 0
for i in range(20):
testFrame.at[len(testFrame)] = [i, i+1, i+2]
testFrame.at[len(testFrame)] = [i, i+1, "df"]
testFrame.at[len(testFrame)] = [chr(i+266), i+1, i+2]newData = [4,5,6]
testFrame = testFrame.append(pandas.Series(newData), ignore_index=True)This is very inefficient
newLine = pandas.DataFrame([1,2,3],None, columns=tableColumns)
newLine = pandas.DataFrame(numpy.array([1,2,3]), columns=tableColumns)
testFrame.append(newLine, ignore_index=True)myDataFrame["diff_A_B"] = datfr["A"] - otherData["B"] # add columns `diff_A_B` from columns of two other `pandas.DataFrames`testFrame = testFrame.TdataFrame = dataFrame.drop("columnName", 1) # 0: rows, 1 columnsNote: ["cake", "alcohol"] is a list.
df1 = df[["cake", "alcohol"]] # keep only "cake" and "alcohol" in df1BUT: df[["a","b"]] produces a copy :(
frames = [df1, df2, df3]
result = pd.concat(frames) # add list of dataframes you want to concatenateor like this:
result = pd.concat([df1, df4], ignore_index=True) # `ignore_index`: discards initial counting >> if `True` count is number of rows -1Here we want to merge on "configID" and "memType" (on argument is not nesessary here since the default searches for matching columns):
# Make sure the indices are reset for both dataframes (reset with `df = df.reset_index()`)
groupedByMemTypeAcc = pandas.merge(groupedByMemTypeAcc, budgetsData, on=["configID", "memType"])
groupedByMemTypeAcc = groupedByMemTypeAcc.groupby(["configID", "memType"]).sum()
print(groupedByMemTypeAcc)Input:
# groupedByMemTypeAcc
configID memType sizeDec
0 micro1 INT_FLASH 17196646
1 micro1 INT_RAM 15302
2 micro2 EXT_FLASH 44667659878
3 micro2 EXT_RAM 386547056
4 micro2 INT_RAM 3276
# budgetData
configID memType budget
0 micro1 INT_FLASH 20971520
1 micro1 INT_RAM 16384
2 micro1 EXT_FLASH 0
3 micro1 EXT_RAM 0
4 micro2 EXT_FLASH 68719476736
5 micro2 EXT_RAM 536870912
6 micro2 INT_RAM 65536
7 micro2 INT_FLASH 0
Output:
sizeDec budget
configID memType
micro1 INT_FLASH 17196646 20971520
INT_RAM 15302 16384
micro2 EXT_FLASH 44667659878 68719476736
EXT_RAM 386547056 536870912
INT_RAM 3276 65536
dataFrame = dataFrame.reset_index()dataFrame.loc["partyHard"].replace(to_replace="nerds", value="hardcorePartyPeople")dataFrame.iloc[1, 1].replace(to_replace="nerds", value="hardcorePartyPeople") # .iloc[columns, row]Get only rows where values above 0 in "A" exists.
The operators are: | for or, & for and, and ~ for not. These must be grouped by using parentheses (e.g. s[(s < -1) | (s > 0.5)])
df = df[df["A"] > 0]print(df)
# one two
# 0 1 2
# 1 3 4
# 2 5 6
# 3 7 8
df.set_index("two")
searchedValue = 6.6
# Get nearest index of column `two`
idx = testFrame.index[testFrame.index.get_loc(searchedValue, method='nearest')]
# Print nearest index of `searchedValue` at column two
print("Closest match:", df["two"][idx])
# Closest match: 8grouped = dataFrame.groupby("Company")Here column "Company" becomes automatically the index (before it was the internal numbering >> see index_col=0 in section ".csv files". Avoid this by appending .reset_index().
Groups by CompanyName and Employees and sums everything up.
groupedByMemType = groupedByMemType.groupby(["CompanyName", "Employees"]).sum()Apply the median for every group item
dbAggregated = grouped.aggregate(numpy.median)for key, item in grouped:
# do stuff with key & itemtimeit df.shape-> 1000000 loops, best of 3: 1.17 us per loop
timeit df[0].count()-> 10000 loops, best of 3: 56 us per loop
timeit len(df.index)-> 1000000 loops, best of 3: 381 ns per loop >> most intuitive and fastest :)
-> len(df.index) and df[0].count() are not interchangeable: count excludes NaNs, which is probably explains why it is slower
We want to achieve the following (A, B: input data; dA: vertical delta):
A B dA
0 a b (a-c)
1 c d (c-e)
2 e f (e-g)
3 g h Nan
You can use .shift to shift a whole column by one (or more) places.
df['dA'] = df['A'] - df['A'].shift(-1)Gets column "video" (containing paths), split by backslashes (note: you need .str for pandas.DataFrame string modifications) and take last element (= file name):
self.dataFrame["video"] = self.dataFrame["video"].str.split("\\").str[-1]...
testFrame.to_csv("blah.csv")anotherFrame = pandas.read_csv("blah.csv", index_col=0) # say that column[0] should be use for internal indexing (otherwise a new one is added)This format option only works for floats but not for ints
pandas.options.display.float_format = '{:14,.0f}'.format
groupedByMemType["sizeDec"] = groupedByMemType["sizeDec"].astype(float) # Change data type of one column (do this only if necessary)
print(groupedByMemType)Print the data type of one pandas.DataFrame-column:
print(groupedByMemType["sizeDec"].dtype)If index is set: x=index; you can use .reset_index() or .set_index("desiredColumn")
Essentially pandas uses matplotlib
import pandas
import matplotlib
import matplotlib.pyplot as plt # needs to be imported explicitely
import numpy
# ...do stuff
dataFrame = ...
matplotlib.style.use('ggplot') # use nice style for plotting
dataFrame.plot() # plots everything in dataframe x=indexdataFrame.plot() # plots everything in dataframe x=index
dataFrame.plot.area() # do area plot
dataFrame.plot(kind=area) # do area plot
plt.show() # show plotdataFrame.plot()
anotherFrame.plot()
# ...
plt.show() # or save- Must be same type of plot
- Plots have to have the same lenth (in x direction)
plot = anotherFrame.plot() # Data to plotmatplotlib.pyplot.savefig("fileName.png", dpi=200) # dpi: optionalmyFigure = plot.get_figure()
myFigure.savefig("myBeautifulFigure.png", # Rest is optional
dpi=300,
transparent=True # Make background colour transparent
)-> https://stackoverflow.com/questions/7986567/matplotlib-how-to-set-the-current-figure
Example data to plot
import matplotlib
import matplotlib.pyplot as plt
# Plot bar graph
barGraph = groupedByMemTypeAcc[[headerUsed, headerAvailable]].plot.bar(stacked=True, figsize=(18, 7), rot=45, title=titleStr)figWidth = 20
figHeight = 10
plt.rcParams['figure.figsize'] = [figWidth, figHeight]plt.subplots_adjust(top=0.9, bottom=0.26, left=0.06, right=0.98) # Adjust space around our plotplt.legend(loc='lower right', bbox_to_anchor=(0.98, -0.4)) # Placing legend below the plot (bottom right corner)Available line styles: https://matplotlib.org/gallery/lines_bars_and_markers/line_styles_reference.html
df.plot(kind='line', style=['-', '--', '-.'])
# `kind` can be omitted
# `style`: one list item per column (do not count index)
# You can prefix the line patterns by specific colours like: `df.plot(kind='line', style=['b-', 'y--', 'g-.'])`
# See also: https://matplotlib.org/users/colors.html
# Ajusting data points in plots is similar: `df.plot(kind='line', style=['bo-', 'ys--', 'g^-.'])`
# circles-----^ ^-squares ^--triangles
# See also: https://stackoverflow.com/a/42820790/4773274Since we have in this case a stacked bar graph we need to use an if-statement in order to treat the two bar types seperately
for i, bar in enumerate(barGraph.patches):
if i >= len(barGraph.patches) / 2:
# Show budgets in kiB
barGraph.annotate(
"{:.0f} kiB".format(groupedByMemTypeAcc["budget"][i % (len(barGraph.patches) / 2)] / 1024),
xy=(bar.get_x(), (bar.get_height() + barGraph.patches[int(i % (len(barGraph.patches) / 2))].get_height())),
color="#030303")
else:
# Show percentage and value in kiB
barGraph.annotate(
"{:.1f} %\n{:.0f} kiB".format(bar.get_height(), groupedByMemTypeAcc["sizeDec"][i] / 1024),
xy=(bar.get_x(), bar.get_height() + 0.01),
color="#f2ece9") # see: https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.annotate.html#matplotlib.axes.Axes.annotateOften for bar.get_height() + 0.01 a multiplication is used by some guys but that did not work for me since it accumulates the values somehow (I don't know what this happens).
dbAgg.plot.scatter(x="sec", y="averageTwoPercentQPnonI", c="bitstream_stats", s=100)- s: dotsize
- c: if column name: adds another scale with colours the points according to its values
TODO: how to change colour of c
