Getting 403 error despite having headers in my code
I’ve written code to scrape weedmaps.com and get data for deliveries, doctors, and dispensaries that supply medical marijuana. The number of listing page urls for each state varies depending on the number of providers they have. Each page has only 100 listings, so I’ve written a loop that tells the crawler to scrape through the nth page url (i.e., Colorado has 882 listings and should loop up to page=9 (see code below for reference).
When I run my code, it’s able to scrape page 1, but as soon as it hits page 2, I get a 403 error despite having headers (my best guess is that the site has strong defenses built in to fight web-crawling; I had to figure out the code by finding the hidden API using the browser developer tools). I don’t think the issue is rate limiting, because I’ve played around before with changing the number of seconds in between API calls with time.sleep, and I still run into this problem. I think there are other hidden parameters that I need to include so that I can actually run my webcrawler, but am not sure what’s the best way to identify those. How do I get around this?
import csv import requests import json import urllib3 import urllib.request import pandas as pd import time from time import sleep from urllib.request import Request, urlopen state_list = ['colorado'] '''state_list = ['alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana', 'nebraska', 'nevada', 'new-hampshire', 'new-jersey', 'new-mexico', 'new-york', 'north-carolina', 'north-dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode-island', 'south-carolina', 'south-dakota', 'tennessee', 'texas', 'utah', 'vermont', 'virginia', 'washington', 'west-virginia', 'wisconsin', 'wyoming', 'puerto-rico']''' json_data = [] for state in state_list: print("Starting on", state) page1url = "https://api-g.weedmaps.com/discovery/v1/listings?page_size=100&size=100&filter%5Bregion_slug%5Bdoctors%5D%5D={0}&filter%5Bregion_slug%5Bdispensaries%5D%5D={0}&filter%5Bregion_slug%5Bdeliveries%5D%5D={0}&filter%5Bplural_types%5D%5B%5D=doctors&filter%5Bplural_types%5D%5B%5D=dispensaries&filter%5Bplural_types%5D%5B%5D=deliveries&page=1".format(state) headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'} req1 = Request(url=page1url, headers=headers) data1 = urlopen(req1).read() js1 = json.loads(data1) num_listings = js1["meta"]["total_listings"] print(num_listings) if num_listings % 100 >= 1: state_pages = (num_listings//100) + 1 remainder = num_listings % 100 print(state_pages, "pages to scrape through and", remainder, "listings on the last page") else: state_pages = num_listings / 100 print(state_pages, "pages to scrape through and no remainders on the last page") for x in range(1, state_pages): starturl = "https://api-g.weedmaps.com/discovery/v1/listings?page_size=100&size=100&filter%5Bregion_slug%5Bdoctors%5D%5D={0}&filter%5Bregion_slug%5Bdispensaries%5D%5D={0}&filter%5Bregion_slug%5Bdeliveries%5D%5D={0}&filter%5Bplural_types%5D%5B%5D=doctors&filter%5Bplural_types%5D%5B%5D=dispensaries&filter%5Bplural_types%5D%5B%5D=deliveries&page=".format(state) url = starturl + str(x) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) time.sleep(5) req = Request(url=url, headers=headers) print(url) data = urlopen(req).read() try: js = json.loads(data) except: js = None print(state,"fail") break for numb in range(100): weedmaps_dict={"id_":"NaN","wmid":"NaN","business":"NaN", "state":"NaN", "city":"NaN", "type_desc":"NaN", "web_url":"NaN","license_type":"NaN", "address":"NaN", "zip_code":"NaN", "timezone":"NaN"} id_ = js["data"]["listings"][numb]["id"] wmid = js["data"]["listings"][numb]["wmid"] business = js["data"]["listings"][numb]["name"].encode("utf-8") state = js["data"]["listings"][numb]["state"].encode("utf-8") city = js["data"]["listings"][numb]["city"].encode("utf-8") type_desc = js["data"]["listings"][numb]["type"].encode("utf-8") web_url = js["data"]["listings"][numb]["web_url"].encode("utf-8") license_type = js["data"]["listings"][numb]["license_type"].encode("utf-8") address = js["data"]["listings"][numb]["address"].encode("utf-8") zip_code = js["data"]["listings"][numb]["zip_code"].encode("utf-8") timezone = js["data"]["listings"][numb]["timezone"].encode("utf-8") if id_ is not None: weedmaps_dict["id_"] = id_ if wmid is not None: weedmaps_dict["wmid"] = wmid if business is not None: weedmaps_dict["business"] = business if state is not None: weedmaps_dict["state"] = state if city is not None: weedmaps_dict["city"] = city if type_desc is not None: weedmaps_dict["type_desc"]=type_desc if web_url is not None: weedmaps_dict["web_url"] = web_url print(business, city, web_url) if license_type is not None: weedmaps_dict["license_type"] = license_type if address is not None: weedmaps_dict["address"]=address if zip_code is not None: weedmaps_dict["zip_code"] = zip_code if timezone is not None: weedmaps_dict["timezone"] = timezone json_data.append(weedmaps_dict) print(state, "page", x, " done") df = pd.DataFrame(json_data) df.to_csv("weedmaps2020q2.csv") print("Saved")