Project description
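- I made a simple app that displays the data scraped from the ICE event website and lets the user download it (the data is openly available).
- The data is scraped from the ICE 2024 exhibitors list (source URL omitted here). There are a total of 31 pages. Enter a page number to download part of the data, or enter 32 to get all of it (the number entered is an exclusive upper bound, so 32 covers pages 1–31).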
# The main scraping code lives on the model. Here is the code with explanation:

# Seconds to wait for each list page before giving up, so that a stalled
# upstream server cannot block the calling web worker indefinitely.
_REQUEST_TIMEOUT = 10


class Ice(models.Model):
    """Holds the user-supplied page limit and scrapes the exhibitor list up to it."""

    page = models.IntegerField()  # this is the user input field

    def data_scrapping(self):
        """Scrape exhibitor names and stands from the paginated list.

        Pages ``1 .. page - 1`` are fetched one after another (``page`` is an
        exclusive upper bound), and every exhibitor found becomes one row.

        Returns:
            A pandas DataFrame with the columns ``Name`` and ``Stand``. The
            columns are present even when nothing was scraped.

        Raises:
            requests.exceptions.RequestException: if a page cannot be fetched,
                including when it does not answer within ``_REQUEST_TIMEOUT``
                seconds.
        """
        data_list = []  # one {'Name': ..., 'Stand': ...} dict per exhibitor
        for i in range(1, int(self.page)):
            # NOTE: the URL prefix is an empty string as shown here; the real
            # list URL has to be filled in before the request can succeed.
            base_url = '' + str(i) + '&searchgroup=00000001-exhibitors'
            url_request = requests.get(base_url, timeout=_REQUEST_TIMEOUT)
            soup_obj = BeautifulSoup(url_request.content, 'html.parser')

            content_exhibitors = soup_obj.find_all(
                'h2',
                attrs={'class': 'm-exhibitors-list__items__item__header__title'},
            )
            content_stand = soup_obj.find_all(
                'div',
                attrs={'class': 'm-exhibitors-list__items__item__header__meta__stand'},
            )

            # NOTE(review): names and stands are collected separately and then
            # paired by position; an exhibitor without a stand element would
            # shift every later pairing on that page - confirm against the
            # page markup.
            for a, b in zip(content_exhibitors, content_stand):
                # Use the heading's own text when it has no <a> child instead
                # of failing on None.
                link = a.find('a')
                text_a = (a if link is None else link).getText().strip()

                # Keep everything after the first ':' (the label prefix). If
                # there is no ':' at all, keep the whole text rather than
                # raising IndexError.
                text = b.getText().strip()
                _label, sep, value = text.partition(':')
                text_b = value.strip() if sep else text

                data_list.append({'Name': text_a, 'Stand': text_b})

        # Naming the columns keeps the header row when data_list is empty.
        return pd.DataFrame(data_list, columns=['Name', 'Stand'])
# Highest page value accepted from the form. The list has 31 pages and the
# page number is an exclusive upper bound, so 32 means "all pages". Capping it
# also stops a single request from triggering an unbounded number of fetches.
_MAX_PAGE = 32


def _parse_page(raw):
    """Return *raw* as an int in ``1.._MAX_PAGE``, or None if it is not one."""
    try:
        page = int(raw)
    except (TypeError, ValueError):
        return None
    return page if 1 <= page <= _MAX_PAGE else None


def home(request):
    """Render the home page, or scrape on POST and download/display the data.

    A POST carrying ``page`` returns the scraped data as a CSV attachment.
    A POST carrying ``page1`` re-renders ``home.html`` with the data as an
    HTML table. Any other request renders ``home.html`` without data.

    The ``Ice`` instance is only used to run the scrape; it is never saved.

    Returns:
        An ``HttpResponse``: the CSV attachment, the rendered template, or a
        400 plain-text response when the submitted page is not a whole number
        between 1 and ``_MAX_PAGE``.
    """
    df_html = None  # stays None unless a display request succeeds

    if request.method == 'POST':
        wants_csv = 'page' in request.POST
        if wants_csv or 'page1' in request.POST:
            page = _parse_page(request.POST['page' if wants_csv else 'page1'])
            if page is None:
                # Reject bad input here instead of letting int() fail inside
                # the scraper and surface as a server error.
                return HttpResponse(
                    'Page must be a whole number between 1 and {}.'.format(_MAX_PAGE),
                    status=400,
                    content_type='text/plain',
                )

            scrapped_data = Ice(page=page).data_scrapping()

            if wants_csv:
                response = HttpResponse(content_type='text/csv')
                response['Content-Disposition'] = 'attachment; filename="exhibitors_list.csv"'
                # index=False matches the HTML view: no unnamed row-number
                # column in the download.
                scrapped_data.to_csv(path_or_buf=response, index=False)
                return response

            df_html = scrapped_data.to_html(index=False)

    return render(request, 'home.html', {'df_html': df_html})
<!DOCTYPE html>
<html>
<body>
  {# Home page: one form downloads the scraped data as CSV, the other shows it as a table. #}
  <h1> Scrap the ICE2024 Exhibitors Data</h1>
  <p>Data is from <a href="">ICE 2024 Website</a> There are total 31 pages, enter the page number to download partially or enter 32 to get all data. </p>
  <br>

  <h2> Download the CSV File </h2>
  {# The field name "page" is what the view checks to return a CSV attachment. #}
  <form method="post">
    {% csrf_token %}
    <input type="number" name="page" min="1" max="32" required placeholder="download to page...">
    <button type="submit">Download the File</button>
  </form>
  <br>

  <h2> Display the data</h2>
  {# The field name "page1" is what the view checks to render the table below. #}
  <form method="post">
    {% csrf_token %}
    <input type="number" name="page1" min="1" max="32" required placeholder="display to page...">
    <button type="submit">Display</button>
  </form>
  <br>

  {# df_html is the table markup passed in by the view; it is empty until a display request is made. #}
  {% if df_html %}
    {{ df_html|safe }}
  {% else %}
    <p>No data available</p>
  {% endif %}
</body>
</html>
Spin up a new server and create a virtual environment (venv) for the project.
cd /etc/systemd/system

# ice.socket - the Unix socket on which systemd listens for gunicorn.
[Unit]
Description=gunicorn socket

[Socket]
# Path of the listening socket.
ListenStream=/run/ice.sock

[Install]
# Without a WantedBy= entry "systemctl enable" has nothing to install, so the
# socket would not be started at boot.
WantedBy=sockets.target
cd /etc/systemd/system

# gunicorn service for the project.
# NOTE(review): presumably saved as ice.service so that ice.socket activates
# it by name - confirm the file name.
[Unit]
Description=gunicorn daemon
# Start the socket unit together with this service.
Requires=ice.socket

[Service]
User=ubuntu
Group=www-data
WorkingDirectory=/home/ubuntu/ice/icedata
# gunicorn from the project's virtual environment; access log goes to stdout
# ("-"), three workers, bound to the Unix socket.
ExecStart=/home/ubuntu/ice/ice/bin/gunicorn \
          --access-logfile - \
          --workers 3 \
          --bind unix:/run/ice.sock \
          icepj.wsgi:application

[Install]
# Without a WantedBy= entry "systemctl enable" has nothing to install.
WantedBy=multi-user.target
# Reverse proxy in front of gunicorn: static files are served from disk,
# everything else is passed to the Unix socket.
server {
    listen 80;
    # Placeholder: the real host name is not shown in this write-up, and nginx
    # rejects a server_name directive with no argument. Replace before use.
    server_name example.com;

    location = /favicon.ico { access_log off; log_not_found off; }

    location /static/ {
        # "root" appends the full request URI, so a root ending in /static
        # would resolve /static/x to .../static/static/x. "alias" maps the
        # location straight onto the directory instead.
        # Assumes the static files live directly in this directory - confirm
        # against the project's STATIC_ROOT.
        alias /home/ubuntu/ice/icedata/static/;
    }

    location /media/ {
        # NOTE(review): with "root" this serves /media/x from
        # /home/ubuntu/ice/icedata/static/media/x - confirm that this is the
        # intended media directory.
        root /home/ubuntu/ice/icedata/static;
    }

    location / {
        include proxy_params;
        proxy_pass http://unix:/run/ice.sock;
    }
}
Set up the CNAME record `ice2014data` and configure the security settings.
