Code de scraping permettant de récupérer l'annonce et le numéro de téléphone depuis leboncoin.fr - menaciri/Scraping_project GitHub Wiki
import csv
import json
import re
import socket
from collections import defaultdict
from itertools import cycle
from random import randint
from time import sleep

import requests
import shadow_useragent
from bs4 import BeautifulSoup
ua = shadow_useragent.ShadowUserAgent() my_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
datadome = [] listeditem = []
for i in range(1, 10): sleep(randint(1, 10)) urlcooies = 'https://www.leboncoin.fr/ventes_immobilieres/demandes/p-' + str(i) + '/' try:
proxies = {
"https": "https://1.0.132.104:8080",
"http": "http://1.0.132.104:8080"
}
cookiesFF = {
'cikneeto_uuid': 'id:4eea2ef6-c9db-47e1-b84a-caa756ee42c7',
'saveOnboarding': '1',
'didomi_token': 'eyJ1c2VyX2lkIjoiMTcxYTc0MzUtNzI4NC02ZGMzLTg0YjUtMzczNDdkOTQzOWU4IiwiY3JlYXRlZCI6IjIwMjAtMDQtMjNUMTM6Mzk6MDAuNTQ3WiIsInVwZGF0ZWQiOiIyMDIwLTA0LTI4VDIxOjQ2OjEwLjEwMloiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsiZ29vZ2xlIiwiYW1hem9uIiwiYzpjb2xsZWN0aXZlLWhoU1l0UlZuIiwiYzpyb2NreW91IiwiYzpwdWJvY2Vhbi1iNkJKTXRzZSIsImM6cnRhcmdldC1HZWZNVnlpQyIsImM6aWxsdW1hdGVjLUNodEVCNGVrIiwiYzpzY2hpYnN0ZWQtTVFQWGFxeWgiLCJjOmdyZWVuaG91c2UtUUtiR0JrczQiLCJjOnNwb3RpbSIsImM6cmVhbHplaXRnLWI2S0NreHlWIiwiYzp2aWRlby1tZWRpYS1ncm91cCIsImM6c3dpdGNoLWNvbmNlcHRzIiwiYzp0cmFkZXRyYWNrZXIiLCJjOmxlbW9tZWRpYS16YllocDJRYyIsImM6bnVnZ2FkIiwiYzpyZXRhcmdldGVyLWJlYWNvbiIsImM6eW9ybWVkaWFzLXFuQldoUXlTIiwiYzpwYXJzZWNtZWQtTVVXZ1VpYWkiLCJjOm1heXRyaWNzZy1BUzM1WWFtOSIsImM6Y2FibGF0b2xpLW5SbVZhd3AyIiwiYzpzYW5vbWEiLCJjOnR1cmJvIiwiYzpyYWR2ZXJ0aXMtU0pwYTI1SDgiLCJjOnF3ZXJ0aXplLXpkbmdFMmh4IiwiYzp2ZG9waWEiLCJjOm1vYmlsZWpvdS1rN2I3WXhQViIsImM6YWRpbW8tUGhVVm02RkUiLCJjOnRyYWRlZG91YmxlciIsImM6dmlkZW9sb2d5IiwiYzpwdWJsaXNoZXJzLWU3RUZkZ1JyIiwiYzpyZXZsaWZ0ZXItY1JwTW5wNXgiLCJjOmluZmVjdGlvdXMtbWVkaWEiLCJjOm1vYmFsb2dtYi1OQkR6aXpwRSIsImM6Y3VlYmlxaW5jLTZuM2J6Y0dtIiwiYzphZHJpbm9zcC1wVjM5M2UzZiIsImM6cmVzZWFyY2gtbm93IiwiYzp2dWJsZS1jTUNKVng0ZSIsImM6d2hlbmV2ZXJtLThWWWh3YjJQIiwiYzphZG1vdGlvbiIsImM6d29vYmkiLCJjOnNob3BzdHlsZS1mV0pLMkxpUCIsImM6dXBwcmdtYmgtNldqN2hzVUgiLCJjOmZvcnR2aXNpb24taWU2YlhUdzkiLCJjOnRoaXJkcHJlc2UtU3NLd21IVksiLCJjOmIyYm1lZGlhLXBRVEZneVdrIiwiYzpwdXJjaCIsImM6YWR1bml0eWwtbTJoa2t0eGoiLCJjOnNjaGlic3RlZC1DTWphTkNVUCIsImM6bGlmZXN0cmVldC1tZWRpYSIsImM6c25hcHVwcHRlLWRoNVlGN3diIiwiYzphZmZpbGluZXQiLCJjOmNlcmVicm9hZC1CYkdFS1F3NSIsImM6c3luYy1uNzRYUXByZyIsImM6cGl4YWxhdGUiLCJjOnB1cnBvc2VsYS0zdzRaZktLRCIsImM6bG90YWRhdGEtTUxMRmpnWmgiLCJjOmludG93b3dpbi1xYXp0NXRHaSIsImM6bGtxZGFkaXYtQ0hCcDdlV1QiLCJjOm9waW5hcnlnbS02aXJGejZ3UyJdLCJkaXNhYmxlZCI6W119LCJwdXJwb3NlcyI6eyJlbmFibGVkIjpbImNvb2tpZXMiLCJhZHZlcnRpc2luZ19wZXJzb25hbGl6YXRpb24iLCJhZF9kZWxpdmVyeSIsImNvbnRlbnRfcGVyc29uYWxpemF0aW9uIiwiYW5hbHl0aWNzIl0sImRpc2FibGVkIjpbXX19',
'euconsent': 'BOyUAiNOylmlVAHABBFRDG-AAAAvRr_7__7-_9_-_f__9uj3Or_v_f__32ccL59v_h_7v-_7fi_-1nV4u_1vft9yfk1-5ctDztp507iakivXmqdeb1v_nz3_9pxPr8k89r7337Ew_v8_v-b7BCON9IAAAAAA',
'sq': 'ca=11_s',
'_gcl_au': '1.1.1690661888.1589230138',
'uuid': '293389e7-542f-4887-8a79-60bd1f22d95e',
'atidvisitor': '%7B%22name%22%3A%22atidvisitor%22%2C%22val%22%3A%7B%22vrn%22%3A%22-562498--598455-%22%2C%22an%22%3A%22NaN%22%2C%22ac%22%3A%220%22%7D%2C%22options%22%3A%7B%22path%22%3A%22%2F%22%2C%22session%22%3A15724800%2C%22end%22%3A15724800%7D%7D',
'autopromo-mondial-relay': '1',
'abtest_user': '1',
'adview_clickmeter': 'search__listing__32__',
'datadome': 'DZhNcdBon67B~hwetVc-Glp0fq5or7SYmJfR32PjsG.GY6GewtS-YTf2JQBkQtUGEuUdjEDcZS9YegLlEkGMSTCT-59MkdSVN-ccG-uLYM',
}
headersf = {
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Upgrade-Insecure-Requests': '1',
'User-Agent': my_user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Referer': 'https://www.leboncoin.fr/',
'Accept-Language': 'en,fr-FR;q=0.9,fr;q=0.8,en-US;q=0.7',
}
resc = requests.post(urlcooies, headers=headersf, cookies=cookiesFF)
cookieslbc = resc.cookies
h = cookieslbc.items()
for name, datadid in h:
datadome.append(datadid)
for datadomeid in datadome:
cookies = {
'cikneeto_uuid': '',
'saveOnboarding': '1',
'didomi_token': '',
'euconsent': '',
'sq': 'ca=11_s',
'_gcl_au': '',
'uuid': '293389e7-542f-4887-8a79-60bd1f22d95e',
'atidvisitor': '%7B%22name%22%3A%22atidvisitor%22%2C%22val%22%3A%7B%22vrn%22%3A%22-562498--598455-%22%2C%22an%22%3A%22NaN%22%2C%22ac%22%3A%220%22%7D%2C%22options%22%3A%7B%22path%22%3A%22%2F%22%2C%22session%22%3A15724800%2C%22end%22%3A15724800%7D%7D',
'autopromo-mondial-relay': '1',
'datadome': datadomeid,
}
headers = {
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': my_user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Accept-Language': 'en,fr-FR;q=0.9,fr;q=0.8,en-US;q=0.7',
}
url = 'https://www.leboncoin.fr/ventes_immobilieres/demandes/p-' + str(i) + '/'
sleep(randint(1, 3))
try:
res = requests.get(url, headers=headers, cookies=cookies)
if not (res.status_code == 200 or res.status_code == 404):
res.raise_for_status()
print(res)
elif res.status_code == 404:
print(str(res) + 'https://www.leboncoin.fr/ventes_immobilieres/demandes/p-' + str(i) + '/')
else:
print(str(res) + str(i))
soup = BeautifulSoup(res.text, 'html.parser')
select = soup.select('._358dQ > div > div:nth-child(1) > ul')
bloc = str(select[0])
bloclinks = bloc.replace('<ul data-reactid="348"> ', '')
bloclinks = bloclinks.replace('>Sauvegarder la recherche</span></button></div></ul>', '')
bloclist = bloclinks.split('<li ')
zonelinks = []
for z in bloclist:
z = str(z)
matchlinks = re.search('href="(.*)/" rel="', z)
zonelinks.append(matchlinks)
zonelinks = list(filter(None.__ne__, zonelinks))
column_names = ["listlink", "listlinkid"]
for row in zonelinks:
row = str(row)
row = row.replace('<re.Match object; span=(1044, 1093), match=', '')
row = row.replace('<re.Match object; span=(1056, 1105), match=', '')
row = row.replace("'href=", "")
row = row.replace('"', '')
listlink = row.replace(' rel=>', '')
listlink = 'https://www.leboncoin.fr' + listlink
listlinkid = listlink.replace('https://www.leboncoin.fr/ventes_immobilieres/', '')
listlinkid = listlinkid.replace('.htm/', '')
datapage = listlink + ',' + listlinkid
datapage = datapage.split(',')
column_names = cycle(column_names)
d = defaultdict(list)
for column, val in zip(column_names, datapage):
d[column].append(val)
for k, v in d.items():
linkandid = ('%s = %s' % (k, ', '.join(map(str, v))))
# print(listlink)
datadome2 = []
urlcooies1 = listlink
sleep(randint(1, 3))
try:
resc1 = requests.post(urlcooies1)
cookieslbc1 = resc1.cookies
b = cookieslbc1.items()
for name2, datadmid2 in b:
datadome2.append(datadmid2)
for datadomeid2 in datadome2:
cookies1 = {
'cikneeto_uuid': '',
'saveOnboarding': '1',
'didomi_token': '',
'euconsent': '',
'sq': 'ca=11_s',
'_gcl_au': '',
'datadome': '' + datadomeid2 + '',
}
headers1 = {
'Connection': 'keep-alive',
'Accept': 'application/json',
'User-Agent': my_user_agent,
'Content-Type': 'application/x-www-form-urlencoded',
'Origin': 'https://www.leboncoin.fr',
'Sec-Fetch-Site': 'same-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': listlink,
'Accept-Language': 'en,fr-FR;q=0.9,fr;q=0.8,en-US;q=0.7',
}
sleep(randint(1, 3))
try:
res1 = requests.get(listlink, headers=headers1, cookies=cookies1)
print(listlink)
doc = BeautifulSoup(res1.text, 'html.parser')
if not (res1.status_code == 200 or res1.status_code == 404):
res1.raise_for_status()
print(res1)
elif res1.status_code == 404:
print(str(res1) + " page " + listlink + " non ouverte")
else:
# print(str(res) +'Good job open link mehdi')
selectortelbloc = '.styles_button__2ps51[data-pub-id="clicknumero"]'
selectbloctel = doc.select(selectortelbloc)
print(str(res1) + " page ouverte")
elementscrap = []
if doc.select(selectortelbloc):
print("find Numéro de téléphone")
selectortitle = '#grid > div.styles_adDescription__2JVQQ > div.styles_Spotlight__3R1xQ > div > h1'
selectbloctitle = doc.select(selectortitle)
print(selectbloctitle)
title = selectbloctitle[0].getText()
print(title)
elementscrap.append(title)
print(elementscrap)
selectordate = '#grid > div.styles_adDescription__2JVQQ > div.styles_Spotlight__3R1xQ > div > div:nth-child(4) > p'
selectblocdate = doc.select(selectordate)
date = selectblocdate[0].getText()
elementscrap.append(str(date))
print(date)
selectorlocationbloc = '#grid > div.styles_adDescription__2JVQQ > div:nth-child(6) > h2'
selectbloclocation = doc.select(selectorlocationbloc)
location = selectbloclocation[0].getText()
print(location)
elementscrap.append(str(location))
selectoridentifiant = '#aside > div > div.styles_ActionBlock__2HWip.styles_noPadding__30CCK > div > div > div.styles_userContainer__3fdV0 > div > div > a > div'
if doc.select(selectoridentifiant):
selectblocidentifiant = doc.select(selectoridentifiant)
identifiant = selectblocidentifiant[0].getText()
print(identifiant)
elementscrap.append(identifiant)
else:
print("can't print identifiant")
cookies2 = {
'cikneeto_uuid': '',
'saveOnboarding': '1',
'didomi_token': '',
'euconsent': '',
'sq': '',
'_gcl_au': '',
'datadome': datadomeid2,
}
headers2 = {
'Connection': 'keep-alive',
'Accept': 'application/json',
'User-Agent': my_user_agent,
'Content-Type': 'application/x-www-form-urlencoded',
'Origin': 'https://www.leboncoin.fr',
'Sec-Fetch-Site': 'same-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': listlink,
'Accept-Language': 'en,fr-FR;q=0.9,fr;q=0.8,en-US;q=0.7',
}
# 1.1 Crawling & scraping URL blog Menu LInks
# requeter les données
# parser la requete
soup0 = BeautifulSoup(res1.text, 'html.parser')
selectbloc = soup0.select('script[id="__NEXT_DATA__"]')
scriptbloc = str(selectbloc)
keybloc = re.search('"KEY_JSON":"(.*)","AVAL":"', scriptbloc)
key = keybloc.group(1)
data = {
'app_id': 'leboncoin_web_utils',
'key': key,
'list_id': listlinkid,
'text': '1'
}
sleep(randint(1, 8))
try:
response = requests.post(
'https://api.leboncoin.fr/api/utils/phonenumber.json',
headers=headers2,
cookies=cookies2, data=data)
if not (response.status_code == 200 or response.status_code == 403):
response.raise_for_status()
print(response)
elif response.status_code == 403:
print(str(
response) + " " + listlink + " requête numéro de téléphone non ouverte")
else:
print(response)
# print(str(response) + 'Good job post Mehdi')
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)
bloctel = str(soup)
statusrequest = json.loads(bloctel)["utils"]["status"]
if not (statusrequest == "OK" or statusrequest == "KO"):
print("The request status is : " + statusrequest)
elif statusrequest == "OK":
tel = json.loads(bloctel)["utils"]["phonenumber"]
elementscrap.append(tel)
print(tel)
print(elementscrap)
def WriteListToCSV(csv_file, csv_columns, data_list):
dataframe = []
try:
with open(dataframe.csv, 'w') as csvfile:
writer = csv.writer(dataframe.csv, dialect='excel', lineterminator='\n',quoting=csv.QUOTE_NONNUMERIC)
writer.writerow(csv_columns)
for line in elementscrap:
writer.writerow(line)
except IOError as err:
print("I/O error({0})".format(err))
return
else:
print("request numéro de téléphone echec")
except socket.gaierror:
print('ignoring failed phone lookup')
else:
print("not find numéro de téléphone")
except socket.gaierror:
print('ignoring failed get linklist lookup')
except socket.gaierror:
print('ignoring failed post linklist lookup')
except socket.gaierror:
print('ignoring failed link+p get lookup')
except socket.gaierror:
print('ignoring failed link+p post lookup')
`