I am trying to convert an HTML table to a CSV file in Python. The table I am trying to extract is this one:
Dades de període…
Solution 1:
Here's one way of doing it. It's probably not the nicest way, but it works! Read through the comments to see what the code is doing.
from bs4 import BeautifulSoup
import csv
# Convert the first "table.tblperiode" table in table.html into out.csv.
# The first <tr> supplies the CSV header; every later <tr> becomes one row.

# Read the HTML; the context manager closes the file handle promptly.
with open("table.html") as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, 'html.parser')

# Get the table from the HTML.
table = soup.select_one("table.tblperiode")

# Find all rows.
rows = table.find_all('tr')

# The first row is the header row; collect the text of its <th> cells.
headers = rows[0]
header_text = [th.text for th in headers.find_all('th')]

# Build one list of cell texts per remaining row. Rows may mix <th> and
# <td> cells, so both are collected in document order; newlines inside a
# cell are dropped and surrounding whitespace is trimmed.
row_text_array = []
for row in rows[1:]:
    row_text = [
        row_element.text.replace('\n', '').strip()
        for row_element in row.find_all(['th', 'td'])
    ]
    row_text_array.append(row_text)

# Output the CSV. newline='' lets the csv module control line endings, as
# its documentation requires (otherwise blank lines appear on Windows).
with open("out.csv", "w", newline='') as csv_file:
    wr = csv.writer(csv_file)
    wr.writerow(header_text)
    wr.writerows(row_text_array)
Solution 2:
With this script:
import csv
from bs4 import BeautifulSoup
# Convert the "table.tblperiode" table in table.html into result.csv using
# a fixed header row.

# Read the HTML; the context manager closes the file handle promptly.
with open('table.html') as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, features='lxml')
table = soup.select_one('table.tblperiode')

rows = []
# Skip the first <tr>: the hard-coded header below is written instead.
for table_row in table.find_all('tr')[1:]:
    # The row's first <th> holds the period label; split/join collapses
    # any run of whitespace (including newlines) into single spaces.
    periode = [' '.join(table_row.find_all('th')[0].text.split())]
    data = [x.text for x in table_row.find_all('td')]
    rows.append(periode + data)

header = ['Periode', 'TM', 'TX', 'TN', 'HRM', 'PPT', 'VVM', 'DVM', 'VVX', 'PM', 'RS']

# newline='' lets the csv module control line endings.
with open('result.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(header)
    w.writerows(rows)
I've managed to generate the following CSV file as output:
import csv
from bs4 import BeautifulSoup
import pandas as pd
# Read the saved page; the context manager closes the file handle promptly.
with open('test.html') as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, features='lxml')
# Specify the table you want to read by its CSS class.
# Example: <table class="queryResults" border="0" cellspacing="1">
table = soup.select_one('table.queryResults')
def get_all_tables(soup):
    """Return the result of searching *soup* for every ``<table>`` tag."""
    return soup.find_all("table")
# List every table in the document with a 1-based index, so the one to
# export can be identified by eye.
tbls = get_all_tables(soup)
for i, tablen in enumerate(tbls, start=1):
    print(i)
    print(tablen)
def get_table_headers(table):
    """Return the stripped text of each ``<th>`` in the table's first row.

    Only the first ``<tr>`` is inspected. An empty list is returned when
    the table has no ``<tr>`` at all, rather than failing on ``None``.
    """
    first_row = table.find("tr")
    if first_row is None:
        return []
    return [th.text.strip() for th in first_row.find_all("th")]
# Column titles taken from the <th> cells of the selected table's first row.
head = get_table_headers(table)
# print(head)


def get_table_rows(table):
    """Return the cell texts of every row after the first one.

    The first ``<tr>`` is skipped (it is treated as the header row). For
    each remaining row the ``<td>`` cells are used; a row with no ``<td>``
    falls back to its ``<th>`` cells (seen, for example, in footer rows of
    Wikipedia tables). Each cell's text is stripped of surrounding
    whitespace.

    Returns a list with one list of strings per row.
    """
    rows = []
    for tr in table.find_all("tr")[1:]:
        # Prefer regular <td> cells; an empty result is falsy, so rows
        # without any <td> fall through to their <th> cells.
        cells = tr.find_all("td") or tr.find_all("th")
        rows.append([cell.text.strip() for cell in cells])
    return rows
# Cell text for every row of the selected table after its first row.
table_rows = get_table_rows(table)
# print(table_rows)


def save_as_csv(table_name, headers, rows):
    """Write *rows* to ``<table_name>.csv`` with *headers* as column labels.

    The data goes through a pandas DataFrame and ``to_csv`` is called with
    its defaults, so the DataFrame's row index is written as an unnamed
    first column.
    """
    pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")
# Writes the extracted table to Test_table.csv (path relative to the
# current working directory).
save_as_csv("Test_table", head, table_rows)
Share
Post a Comment
for "Converting A HTML Table To A CSV In Python"
Post a Comment for "Converting A HTML Table To A CSV In Python"