Code de scraping permettant de récupérer l'annonce et le numéro de téléphone depuis leboncoin.fr - menaciri/Scraping_project GitHub Wiki

# Standard library
import csv
import json
import re
import socket
from collections import defaultdict
from itertools import cycle
from random import randint
from time import sleep

# Third party
import requests
import shadow_useragent
from bs4 import BeautifulSoup

# shadow_useragent can rotate user agents, but a fixed desktop Chrome UA
# string is what is actually sent with every request below.
ua = shadow_useragent.ShadowUserAgent()
my_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'

# datadome accumulates anti-bot tokens returned by leboncoin across pages.
datadome = []
# NOTE(review): listeditem is never used anywhere in this script.
listeditem = []

# 1.1 Crawling & scraping URL blog menu links

# Request the data (query the listing pages)

def WriteListToCSV(csv_file, csv_columns, data_list):
    """Write a header row (csv_columns) followed by one data row (data_list) to csv_file.

    NOTE(review): this helper is never invoked by the original script; call it
    with e.g. WriteListToCSV('dataframe.csv', column_names, elementscrap) to
    actually persist the scraped fields.
    """
    try:
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, dialect='excel', lineterminator='\n',
                                quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(csv_columns)
            writer.writerow(data_list)
    except IOError as err:
        print("I/O error({0})".format(err))


# Crawl listing pages p-1 .. p-9 of "ventes immobilieres / demandes".
for i in range(1, 10):
    sleep(randint(1, 10))  # random pause so the traffic looks more human
    urlcooies = 'https://www.leboncoin.fr/ventes_immobilieres/demandes/p-' + str(i) + '/'
    try:
        # Cookies captured from a real browser session.  The 'datadome' value
        # is the anti-bot token; this first POST is made to obtain a fresh one.
        cookiesFF = {
            'cikneeto_uuid': 'id:4eea2ef6-c9db-47e1-b84a-caa756ee42c7',
            'saveOnboarding': '1',
            'didomi_token': 'eyJ1c2VyX2lkIjoiMTcxYTc0MzUtNzI4NC02ZGMzLTg0YjUtMzczNDdkOTQzOWU4IiwiY3JlYXRlZCI6IjIwMjAtMDQtMjNUMTM6Mzk6MDAuNTQ3WiIsInVwZGF0ZWQiOiIyMDIwLTA0LTI4VDIxOjQ2OjEwLjEwMloiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsiZ29vZ2xlIiwiYW1hem9uIiwiYzpjb2xsZWN0aXZlLWhoU1l0UlZuIiwiYzpyb2NreW91IiwiYzpwdWJvY2Vhbi1iNkJKTXRzZSIsImM6cnRhcmdldC1HZWZNVnlpQyIsImM6aWxsdW1hdGVjLUNodEVCNGVrIiwiYzpzY2hpYnN0ZWQtTVFQWGFxeWgiLCJjOmdyZWVuaG91c2UtUUtiR0JrczQiLCJjOnNwb3RpbSIsImM6cmVhbHplaXRnLWI2S0NreHlWIiwiYzp2aWRlby1tZWRpYS1ncm91cCIsImM6c3dpdGNoLWNvbmNlcHRzIiwiYzp0cmFkZXRyYWNrZXIiLCJjOmxlbW9tZWRpYS16YllocDJRYyIsImM6bnVnZ2FkIiwiYzpyZXRhcmdldGVyLWJlYWNvbiIsImM6eW9ybWVkaWFzLXFuQldoUXlTIiwiYzpwYXJzZWNtZWQtTVVXZ1VpYWkiLCJjOm1heXRyaWNzZy1BUzM1WWFtOSIsImM6Y2FibGF0b2xpLW5SbVZhd3AyIiwiYzpzYW5vbWEiLCJjOnR1cmJvIiwiYzpyYWR2ZXJ0aXMtU0pwYTI1SDgiLCJjOnF3ZXJ0aXplLXpkbmdFMmh4IiwiYzp2ZG9waWEiLCJjOm1vYmlsZWpvdS1rN2I3WXhQViIsImM6YWRpbW8tUGhVVm02RkUiLCJjOnRyYWRlZG91YmxlciIsImM6dmlkZW9sb2d5IiwiYzpwdWJsaXNoZXJzLWU3RUZkZ1JyIiwiYzpyZXZsaWZ0ZXItY1JwTW5wNXgiLCJjOmluZmVjdGlvdXMtbWVkaWEiLCJjOm1vYmFsb2dtYi1OQkR6aXpwRSIsImM6Y3VlYmlxaW5jLTZuM2J6Y0dtIiwiYzphZHJpbm9zcC1wVjM5M2UzZiIsImM6cmVzZWFyY2gtbm93IiwiYzp2dWJsZS1jTUNKVng0ZSIsImM6d2hlbmV2ZXJtLThWWWh3YjJQIiwiYzphZG1vdGlvbiIsImM6d29vYmkiLCJjOnNob3BzdHlsZS1mV0pLMkxpUCIsImM6dXBwcmdtYmgtNldqN2hzVUgiLCJjOmZvcnR2aXNpb24taWU2YlhUdzkiLCJjOnRoaXJkcHJlc2UtU3NLd21IVksiLCJjOmIyYm1lZGlhLXBRVEZneVdrIiwiYzpwdXJjaCIsImM6YWR1bml0eWwtbTJoa2t0eGoiLCJjOnNjaGlic3RlZC1DTWphTkNVUCIsImM6bGlmZXN0cmVldC1tZWRpYSIsImM6c25hcHVwcHRlLWRoNVlGN3diIiwiYzphZmZpbGluZXQiLCJjOmNlcmVicm9hZC1CYkdFS1F3NSIsImM6c3luYy1uNzRYUXByZyIsImM6cGl4YWxhdGUiLCJjOnB1cnBvc2VsYS0zdzRaZktLRCIsImM6bG90YWRhdGEtTUxMRmpnWmgiLCJjOmludG93b3dpbi1xYXp0NXRHaSIsImM6bGtxZGFkaXYtQ0hCcDdlV1QiLCJjOm9waW5hcnlnbS02aXJGejZ3UyJdLCJkaXNhYmxlZCI6W119LCJwdXJwb3NlcyI6eyJlbmFibGVkIjpbImNvb2tpZXMiLCJhZHZlcnRpc2luZ19wZXJzb25hbGl6YXRpb24iLCJhZF9kZWxpdmVyeSIsImNvbnRlbnRfcGVyc29uYWxpemF0aW9uIiwiYW5hbHl0aWNzIl0sImRpc2FibGVkIjpbXX19',
            'euconsent': 'BOyUAiNOylmlVAHABBFRDG-AAAAvRr_7__7-_9_-_f__9uj3Or_v_f__32ccL59v_h_7v-_7fi_-1nV4u_1vft9yfk1-5ctDztp507iakivXmqdeb1v_nz3_9pxPr8k89r7337Ew_v8_v-b7BCON9IAAAAAA',
            'sq': 'ca=11_s',
            '_gcl_au': '1.1.1690661888.1589230138',
            'uuid': '293389e7-542f-4887-8a79-60bd1f22d95e',
            'atidvisitor': '%7B%22name%22%3A%22atidvisitor%22%2C%22val%22%3A%7B%22vrn%22%3A%22-562498--598455-%22%2C%22an%22%3A%22NaN%22%2C%22ac%22%3A%220%22%7D%2C%22options%22%3A%7B%22path%22%3A%22%2F%22%2C%22session%22%3A15724800%2C%22end%22%3A15724800%7D%7D',
            'autopromo-mondial-relay': '1',
            'abtest_user': '1',
            'adview_clickmeter': 'search__listing__32__',
            'datadome': 'DZhNcdBon67B~hwetVc-Glp0fq5or7SYmJfR32PjsG.GY6GewtS-YTf2JQBkQtUGEuUdjEDcZS9YegLlEkGMSTCT-59MkdSVN-ccG-uLYM',
        }

        headersf = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': my_user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Referer': 'https://www.leboncoin.fr/',
            'Accept-Language': 'en,fr-FR;q=0.9,fr;q=0.8,en-US;q=0.7',
        }

        # POST the listing URL; every cookie value the server sets back
        # (notably the datadome token) is accumulated for re-use below.
        resc = requests.post(urlcooies, headers=headersf, cookies=cookiesFF)
        for name, datadid in resc.cookies.items():
            datadome.append(datadid)

        for datadomeid in datadome:
            cookies = {
                'cikneeto_uuid': '',
                'saveOnboarding': '1',
                'didomi_token': '',
                'euconsent': '',
                'sq': 'ca=11_s',
                '_gcl_au': '',
                'uuid': '293389e7-542f-4887-8a79-60bd1f22d95e',
                'atidvisitor': '%7B%22name%22%3A%22atidvisitor%22%2C%22val%22%3A%7B%22vrn%22%3A%22-562498--598455-%22%2C%22an%22%3A%22NaN%22%2C%22ac%22%3A%220%22%7D%2C%22options%22%3A%7B%22path%22%3A%22%2F%22%2C%22session%22%3A15724800%2C%22end%22%3A15724800%7D%7D',
                'autopromo-mondial-relay': '1',
                'datadome': datadomeid,
            }

            headers = {
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': my_user_agent,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Sec-Fetch-Site': 'none',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-User': '?1',
                'Sec-Fetch-Dest': 'document',
                'Accept-Language': 'en,fr-FR;q=0.9,fr;q=0.8,en-US;q=0.7',
            }

            url = 'https://www.leboncoin.fr/ventes_immobilieres/demandes/p-' + str(i) + '/'
            sleep(randint(1, 3))
            try:
                res = requests.get(url, headers=headers, cookies=cookies)
                if not (res.status_code == 200 or res.status_code == 404):
                    res.raise_for_status()  # any other status is an error
                    print(res)

                elif res.status_code == 404:
                    print(str(res) + 'https://www.leboncoin.fr/ventes_immobilieres/demandes/p-' + str(i) + '/')

                else:
                    print(str(res) + str(i))
                    soup = BeautifulSoup(res.text, 'html.parser')
                    select = soup.select('._358dQ > div > div:nth-child(1) > ul')
                    bloc = str(select[0])
                    bloclinks = bloc.replace('<ul data-reactid="348"> ', '')
                    bloclinks = bloclinks.replace('>Sauvegarder la recherche</span></button></div></ul>', '')
                    bloclist = bloclinks.split('<li ')

                    # Pull the ad path out of each <li> fragment.  Using
                    # match.group(1) replaces the original trick of string-
                    # replacing the repr() of the Match object, which only
                    # worked for two hard-coded span positions.  The trailing
                    # '/' is re-appended so the '.htm/' suffix strips cleanly.
                    zonelinks = []
                    for fragment in bloclist:
                        matchlinks = re.search('href="(.*)/" rel="', str(fragment))
                        if matchlinks is not None:
                            zonelinks.append(matchlinks.group(1) + '/')

                    column_names = ["listlink", "listlinkid"]

                    for path in zonelinks:
                        listlink = 'https://www.leboncoin.fr' + path
                        # The numeric ad id, used by the phone-number API.
                        listlinkid = listlink.replace('https://www.leboncoin.fr/ventes_immobilieres/', '')
                        listlinkid = listlinkid.replace('.htm/', '')

                        datadome2 = []
                        sleep(randint(1, 3))
                        try:
                            # Bare POST on the ad page to collect its own
                            # fresh datadome token(s).
                            resc1 = requests.post(listlink)
                            for name2, datadmid2 in resc1.cookies.items():
                                datadome2.append(datadmid2)

                            for datadomeid2 in datadome2:
                                cookies1 = {
                                    'cikneeto_uuid': '',
                                    'saveOnboarding': '1',
                                    'didomi_token': '',
                                    'euconsent': '',
                                    'sq': 'ca=11_s',
                                    '_gcl_au': '',
                                    'datadome': datadomeid2,
                                }

                                headers1 = {
                                    'Connection': 'keep-alive',
                                    'Accept': 'application/json',
                                    'User-Agent': my_user_agent,
                                    'Content-Type': 'application/x-www-form-urlencoded',
                                    'Origin': 'https://www.leboncoin.fr',
                                    'Sec-Fetch-Site': 'same-site',
                                    'Sec-Fetch-Mode': 'cors',
                                    'Sec-Fetch-Dest': 'empty',
                                    'Referer': listlink,
                                    'Accept-Language': 'en,fr-FR;q=0.9,fr;q=0.8,en-US;q=0.7',
                                }
                                sleep(randint(1, 3))
                                try:
                                    res1 = requests.get(listlink, headers=headers1, cookies=cookies1)
                                    print(listlink)
                                    doc = BeautifulSoup(res1.text, 'html.parser')

                                    if not (res1.status_code == 200 or res1.status_code == 404):
                                        res1.raise_for_status()
                                        print(res1)

                                    elif res1.status_code == 404:
                                        print(str(res1) + " page " + listlink + " non ouverte")

                                    else:
                                        # The "clicknumero" button is only present
                                        # when the ad exposes a phone number.
                                        selectortelbloc = '.styles_button__2ps51[data-pub-id="clicknumero"]'
                                        selectbloctel = doc.select(selectortelbloc)
                                        print(str(res1) + " page ouverte")

                                        elementscrap = []
                                        if selectbloctel:
                                            print("find Numéro de téléphone")

                                            # Ad title
                                            selectortitle = '#grid > div.styles_adDescription__2JVQQ > div.styles_Spotlight__3R1xQ > div > h1'
                                            selectbloctitle = doc.select(selectortitle)
                                            print(selectbloctitle)
                                            title = selectbloctitle[0].getText()
                                            print(title)
                                            elementscrap.append(title)
                                            print(elementscrap)

                                            # Publication date
                                            selectordate = '#grid > div.styles_adDescription__2JVQQ > div.styles_Spotlight__3R1xQ > div > div:nth-child(4) > p'
                                            selectblocdate = doc.select(selectordate)
                                            date = selectblocdate[0].getText()
                                            elementscrap.append(str(date))
                                            print(date)

                                            # Location (city / zip code)
                                            selectorlocationbloc = '#grid > div.styles_adDescription__2JVQQ > div:nth-child(6) > h2'
                                            selectbloclocation = doc.select(selectorlocationbloc)
                                            location = selectbloclocation[0].getText()
                                            print(location)
                                            elementscrap.append(str(location))

                                            # Seller display name, when present.
                                            selectoridentifiant = '#aside > div > div.styles_ActionBlock__2HWip.styles_noPadding__30CCK > div > div > div.styles_userContainer__3fdV0 > div > div > a > div'
                                            if doc.select(selectoridentifiant):
                                                selectblocidentifiant = doc.select(selectoridentifiant)
                                                identifiant = selectblocidentifiant[0].getText()
                                                print(identifiant)
                                                elementscrap.append(identifiant)

                                            else:
                                                print("can't print identifiant")
                                                # NOTE(review): the phone-number lookup below lives in
                                                # this else branch, so it only runs when NO seller
                                                # identifiant was found.  Preserved from the original,
                                                # but this looks like an indentation slip — confirm.

                                                cookies2 = {
                                                    'cikneeto_uuid': '',
                                                    'saveOnboarding': '1',
                                                    'didomi_token': '',
                                                    'euconsent': '',
                                                    'sq': '',
                                                    '_gcl_au': '',
                                                    'datadome': datadomeid2,
                                                }

                                                headers2 = {
                                                    'Connection': 'keep-alive',
                                                    'Accept': 'application/json',
                                                    'User-Agent': my_user_agent,
                                                    'Content-Type': 'application/x-www-form-urlencoded',
                                                    'Origin': 'https://www.leboncoin.fr',
                                                    'Sec-Fetch-Site': 'same-site',
                                                    'Sec-Fetch-Mode': 'cors',
                                                    'Sec-Fetch-Dest': 'empty',
                                                    'Referer': listlink,
                                                    'Accept-Language': 'en,fr-FR;q=0.9,fr;q=0.8,en-US;q=0.7',
                                                }

                                                # The API key is embedded in the page's
                                                # __NEXT_DATA__ bootstrap script.
                                                soup0 = BeautifulSoup(res1.text, 'html.parser')
                                                selectbloc = soup0.select('script[id="__NEXT_DATA__"]')
                                                keybloc = re.search('"KEY_JSON":"(.*)","AVAL":"', str(selectbloc))
                                                if keybloc is None:
                                                    # Guard: original crashed with AttributeError here.
                                                    print("request numéro de téléphone echec")
                                                else:
                                                    key = keybloc.group(1)

                                                    data = {
                                                        'app_id': 'leboncoin_web_utils',
                                                        'key': key,
                                                        'list_id': listlinkid,
                                                        'text': '1',
                                                    }
                                                    sleep(randint(1, 8))
                                                    try:
                                                        response = requests.post(
                                                            'https://api.leboncoin.fr/api/utils/phonenumber.json',
                                                            headers=headers2,
                                                            cookies=cookies2, data=data)
                                                        if not (response.status_code == 200 or response.status_code == 403):
                                                            response.raise_for_status()
                                                            print(response)

                                                        elif response.status_code == 403:
                                                            print(str(
                                                                response) + " " + listlink + " requête numéro de téléphone non ouverte")

                                                        else:
                                                            print(response)
                                                            # The body is plain JSON: parse it once
                                                            # directly (the original round-tripped it
                                                            # through BeautifulSoup + str first).
                                                            payload = json.loads(response.text)
                                                            print(payload)
                                                            statusrequest = payload["utils"]["status"]

                                                            if not (statusrequest == "OK" or statusrequest == "KO"):
                                                                print("The request status is : " + statusrequest)
                                                            elif statusrequest == "OK":
                                                                tel = payload["utils"]["phonenumber"]
                                                                elementscrap.append(tel)
                                                                print(tel)
                                                                print(elementscrap)
                                                            else:
                                                                print("request numéro de téléphone echec")
                                                    except (socket.gaierror, requests.RequestException):
                                                        print('ignoring failed phone lookup')
                                        else:
                                            print("not find numéro de téléphone")
                                except (socket.gaierror, requests.RequestException):
                                    print('ignoring failed get linklist lookup')
                        except (socket.gaierror, requests.RequestException):
                            print('ignoring failed post linklist lookup')
            except (socket.gaierror, requests.RequestException):
                print('ignoring failed link+p get lookup')
    except (socket.gaierror, requests.RequestException):
        print('ignoring failed link+p post lookup')

`

⚠️ **GitHub.com Fallback** ⚠️