Save a pandas dataframe as table in Image or pdf document with nice multi index display
10,052
The solution that worked for me: with pandas >= 0.17, I installed pdflatex and copied the required LaTeX package files (booktabs.sty, geography.sty and pdflscape.sty) into the output folder.
import pandas as pd
import os
import math
def save_summary_table_as_pdf(path_to_csv, path_to_output_folder):
    """Render per-cluster summary statistics of a CSV file as a PDF table.

    The CSV is grouped by its ``Cluster`` column and described with the
    5th, 50th and 95th percentiles.  For every cluster the rows ``mean``,
    ``5%``, ``50%`` and ``95%`` are kept, rounded to two decimals, and
    indexed by (cluster, population, frequency, stats).  The table is
    written in chunks of 40 rows to ``clustering_summary_table.tex`` inside
    ``path_to_output_folder`` and compiled there with ``pdflatex``; the
    intermediate ``.tex``, ``.aux`` and ``.log`` files are removed
    afterwards.

    Args:
        path_to_csv: Path of a comma-separated file containing a
            ``Cluster`` column.
        path_to_output_folder: Existing folder that receives the PDF.

    Raises:
        OSError: If the ``pdflatex`` executable cannot be started or the
            ``.tex`` file cannot be written.
    """
    # Function-scope import: only this function spawns a process.
    import subprocess

    rows_per_page = 40
    job_name = "clustering_summary_table"

    df = pd.read_csv(path_to_csv, sep=',')

    # data preparation
    res = df.groupby('Cluster').describe([0.05, 0.5, 0.95])
    if isinstance(res.columns, pd.MultiIndex):
        # pandas >= 0.20 reports the statistics as a second column level
        # instead of a second index level.  Move them back into the index
        # so the rest of the function sees the (cluster, statistic) layout,
        # and keep the data columns in their original order.
        column_order = res.columns.get_level_values(0).unique()
        res = res.stack(level=1).reindex(columns=column_order)
    res.index.names = ['Cluster', 'Stats']

    cluster_of_row = res.index.get_level_values('Cluster')
    stat_of_row = res.index.get_level_values('Stats')
    res['cluster'] = cluster_of_row
    res['stats'] = stat_of_row

    # The population of a cluster is the 'count' statistic of the first
    # data column.  Mapping it by cluster value avoids the index
    # labels/codes attribute, whose name differs between pandas versions.
    is_count = stat_of_row == 'count'
    populations = res.iloc[is_count, 0].values.tolist()
    population_by_cluster = dict(zip(cluster_of_row[is_count], populations))
    res['population'] = [population_by_cluster[c] for c in cluster_of_row]
    total_pop = sum(populations)
    res['frequency'] = (res['population'] / total_pop).round(3)
    res.set_index(['cluster', 'population', 'frequency', 'stats'],
                  inplace=True)

    wanted_stats = ['5%', 'mean', '50%', '95%']
    res1 = res[res.index.get_level_values('stats').isin(wanted_stats)]
    res1 = res1.round(2)
    res1 = res1.rename(columns=lambda x: x.replace('_', ' '))

    # latex
    templatetop = r'''\documentclass[a3paper, 5pt]{article}
\usepackage{booktabs}
\usepackage{pdflscape}
\usepackage[a4paper,bindingoffset=0.2in,%
left=0.25in,right=0.25in,top=1in,bottom=1in,%
footskip=.25in]{geometry}
\begin{document}
\begin{landscape}
\pagenumbering{gobble}
\oddsidemargin = 0pt
\hoffset = -0.25in
\topmargin = 1pt
\headheight = 0pt
\headsep = 0pt
'''
    # Raw string: '\e' is not a valid Python escape sequence.
    templatebottom = r'''
\end{landscape}
\end{document}
'''
    output_tex = os.path.join(path_to_output_folder, job_name + ".tex")
    try:
        # Text mode: the template and to_latex() output are str objects.
        with open(output_tex, "w") as afile:
            afile.write(templatetop + '\n')
            for start in range(0, res1.shape[0], rows_per_page):
                page = res1.iloc[start:start + rows_per_page, :]
                # escape=True keeps '%' in labels such as '5%' from being
                # read by LaTeX as the start of a comment.
                afile.write(page.to_latex(escape=True) + '\n' +
                            r"\pagenumbering{gobble}")
            afile.write(templatebottom + '\n')
        # Run pdflatex inside the output folder without changing this
        # process' working directory; nonstopmode keeps LaTeX from waiting
        # for keyboard input when it hits an error.
        subprocess.call(['pdflatex', '-interaction=nonstopmode',
                         job_name + '.tex'],
                        cwd=path_to_output_folder)
    finally:
        # Remove intermediate files, tolerating ones never created.
        for extension in ('.tex', '.aux', '.log'):
            leftover = os.path.join(path_to_output_folder,
                                    job_name + extension)
            if os.path.exists(leftover):
                os.remove(leftover)
if __name__ == "__main__":
    # Command-line entry point: copy the bundled LaTeX style files next to
    # the output, build the PDF summary table, then remove the copies.
    # The call form of print works on both Python 2 and Python 3.
    print('begin generate pdf table about clustering')
    import argparse
    import shutil
    parser = argparse.ArgumentParser()
    parser.add_argument("path_to_csv")
    parser.add_argument("outputfolder")
    args = vars(parser.parse_args())
    filedir = os.path.abspath(os.path.dirname(__file__))
    output_folder_path_abs = os.path.abspath(args['outputfolder'])
    # Despite its name this is the absolute path of the input CSV file.
    input_folder_path_abs = os.path.abspath(args['path_to_csv'])
    # NOTE(review): the LaTeX template loads the 'geometry' package, so
    # 'geography.sty' may be a typo for 'geometry.sty' - confirm against
    # the contents of userpackagelatex/ before renaming.
    latex_packages = ('booktabs.sty', 'geography.sty', 'pdflscape.sty')
    # copy the user package latex to the folder
    # (shutil.copy instead of a shell 'scp' string, so paths containing
    # spaces or shell metacharacters are handled correctly)
    for package in latex_packages:
        shutil.copy(os.path.join(filedir, 'userpackagelatex', package),
                    output_folder_path_abs)
    try:
        save_summary_table_as_pdf(input_folder_path_abs,
                                  output_folder_path_abs)
    finally:
        # Remove the copied style files even if the PDF step failed.
        for package in latex_packages:
            copied = os.path.join(output_folder_path_abs, package)
            if os.path.exists(copied):
                os.remove(copied)
Author by
Luce Philibert
Data lover/hater. Tech enthusiast. Startup curious.
Updated on June 04, 2022
Comments
-
Luce Philibert almost 2 years
I'm trying to include a data frame with multi-index in a report in pdf. I would like to have a nice table output.
I have found these 2 solutions:
pandas.df -> HTML -> pdf
import pandas as pd
from IPython.display import HTML
import pdfkit

# df generation: per-cluster describe() output, re-indexed by
# (Cluster, population, frequency, stats)
df = pd.read_csv(path_to_csv, sep=',')
summary = df.groupby('Cluster').describe([0.05, 0.5, 0.95])
summary.index.rename(['Cluster', 'stats'], inplace=True)
summary['Cluster'] = summary.index.get_level_values('Cluster')
summary['stats'] = summary.index.get_level_values('stats')

# population of each cluster = 'count' row of the first column
count_rows = summary.index.get_level_values('stats') == 'count'
populations = summary.iloc[count_rows, 0].values.tolist()
summary['population'] = [populations[i]
                         for i in summary.index.labels[0].values()]
total_pop = sum(populations)
summary['frequency'] = (summary['population'] / total_pop).round(3)
summary.set_index(['Cluster', 'population', 'frequency', 'stats'],
                  inplace=True)

# keep only the mean and the three percentiles, rounded to 2 decimals
kept_stats = ['5%', 'mean', '50%', '95%']
selected = summary.iloc[
    summary.index.get_level_values('stats').isin(kept_stats)]
selected = selected.round(2)

# saving the df: write the HTML table, then convert it with pdfkit
html_table = HTML(selected.to_html())
with open('test.html', 'w') as html_file:
    html_file.write(html_table.data)

options = {'orientation': 'Landscape'}
with open('test.html') as f:
    pdfkit.from_file(f, 'out.pdf', options=options)
But this depends on pdfkit, which makes it difficult for us. That's why I am trying to use pandas.df -> tex -> pdf (as mentioned in "Export a Pandas dataframe as a table image"):
import pandas as pd
import os
# df generation
df = pd.read_csv(path_to_csv, sep =',')
groupeddf = df.groupby('Cluster')
res = groupeddf.describe([0.05, 0.5, 0.95])
res.index.rename(['Cluster', 'stats'], inplace=True)
res['Cluster'] = res.index.get_level_values('Cluster')
res['stats'] = res.index.get_level_values('stats')
populations = (res.iloc[(res.index.get_level_values('stats') == 'count'), \
    0].values).tolist()
res['population'] = [populations[i] for i in res.index.labels[0].values()]
total_pop = sum(populations)
res['frequency'] =(res['population']/total_pop).round(3)
res.set_index(['Cluster', 'population','frequency', 'stats'], inplace=True)
res1 = res.iloc[(res.index.get_level_values('stats') == '5%') |
    (res.index.get_level_values('stats') == 'mean') |
    (res.index.get_level_values('stats') == '50%') |
    (res.index.get_level_values('stats') == '95%')]
res1 = res1.round(2)
res1.rename(columns=lambda x: x.replace('_', ' '), inplace=True)
#latex
template = r'''\documentclass[preview]{{standalone}}
\usepackage{{booktabs}}
\begin{{document}}
{}
\end{{document}}
'''
with open("outputfile.tex", "wb") as afile:
    afile.write(template.format(res1.to_latex()))
os.system("pdflatex outputfile.tex")
However, I am not familiar with latex, and I get this error :
! LaTeX Error: File `standalone.cls' not found. Type X to quit or <RETURN> to proceed, or enter a new name. (Default extension: cls)
Any idea about the error or the standard way to do pandas.df -> pdf ?