For one of my projects I needed to do web scraping and collect images from websites.
Here is the code I use to collect them:
import os
import json
import random
import string
import time
import datetime
import requests
from bs4 import BeautifulSoup
from os.path import basename
# Legacy azure-storage SDK: only the block blob client is used below
from azure.storage.blob import BlockBlobService
# Local folder where the scraped images are stored before upload
CURRENT_DIR = os.getcwd()
STORING_DIRECTORY_NAME = "storage_scraped_images"
STORING_DIRECTORY = os.path.join(CURRENT_DIR, STORING_DIRECTORY_NAME)
if not os.path.exists(STORING_DIRECTORY):
    os.makedirs(STORING_DIRECTORY)

def randomword(length):
    # Random lowercase string, e.g. to build unique blob names
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(length))
# Timestamp used in the blob metadata
startdate = datetime.datetime.utcnow().isoformat()
metadata_loaded = {'Owner': 'ToBeAddedSoon', 'Date_Of_Upload': startdate, 'VAR_2': 'VAL_VAR_2', 'VAR_3': 'VAL_VAR_3', 'VAR_4': 'VAL_VAR_4'}
# Load the storage account credentials from a local JSON file
with open("credentials.json", 'r') as f:
    data = json.loads(f.read())
StoAcc_var_name = data["storagacc"]["Accountname"]
StoAcc_var_key = data["storagacc"]["AccountKey"]
StoAcc_var_container = data["storagacc"]["Container"]
# print(StoAcc_var_name, StoAcc_var_key, StoAcc_var_container)
def copy_azure_files(source_url, destination_object, destination_container):
    # Ask the storage service to copy the image directly from its source URL
    blob_service = BlockBlobService(account_name=StoAcc_var_name, account_key=StoAcc_var_key)
    blob_service.copy_blob(destination_container, destination_object, source_url)
block_blob_service = BlockBlobService(account_name=StoAcc_var_name, account_key=StoAcc_var_key)

def upload_func(container, blobname, filename):
    # Upload a local file and report how long the transfer took
    start = time.time()
    block_blob_service.create_blob_from_path(
        container,
        blobname,
        filename)
    elapsed = time.time() - start
    print("*** DEBUG *** Time spent uploading API", filename, "is:", elapsed, "in Bucket/container:", container)
#URL_TARGET = "https://mouradcloud.westeurope.cloudapp.azure.com/blog/blog/category/food/"
URL_TARGET = "https://www.cdiscount.com/search/10/telephone.html"
base_url = URL_TARGET
out_folder = '/tmp'
r = requests.get(URL_TARGET)
data = r.text
soup = BeautifulSoup(data, "lxml")
for link in soup.find_all('img'):
    image_url = link.get("src")
    if image_url is None:
        continue
    if 'http' not in image_url:
        print(" ->>>>>>>>>>>>>> THIS IS A LOCAL IMAGE ... SKIPPING ")
        continue
    if not image_url.endswith(('.png', '.jpg', '.jpeg')):
        print(" ->>>>>>>>>>>>>> THIS IS NOT AN IMAGE ... SKIPPING ")
        continue
    print(" ->>>>>>>>>>>>>> THIS IS AN IMAGE ... PROCESSING ")
    # Save a local copy, then have Azure copy the image from its source URL
    file_name_downloaded = basename(image_url)
    file_name_path_local = os.path.join(STORING_DIRECTORY, file_name_downloaded)
    with open(file_name_path_local, "wb") as f:
        f.write(requests.get(image_url).content)
    filename_in_clouddir = "uploads/" + file_name_downloaded
    #upload_func(StoAcc_var_container, filename_in_clouddir, file_name_path_local)
    copy_azure_files(image_url, filename_in_clouddir, StoAcc_var_container)
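The metadata_loaded dict above is built but never attached to anything yet. If I understand the legacy azure-storage SDK correctly, both create_blob_from_path and copy_blob take an optional metadata argument, so a sketch like this (untested, reusing the names defined above) should tag the blobs on upload or copy:

# Sketch (untested assumption): attach metadata_loaded via the optional
# metadata argument of the legacy BlockBlobService calls used above.
def upload_func_with_metadata(container, blobname, filename):
    block_blob_service.create_blob_from_path(
        container,
        blobname,
        filename,
        metadata=metadata_loaded)

def copy_azure_files_with_metadata(source_url, destination_object, destination_container):
    blob_service = BlockBlobService(account_name=StoAcc_var_name, account_key=StoAcc_var_key)
    blob_service.copy_blob(destination_container, destination_object, source_url,
                           metadata=metadata_loaded)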
The credentials file looks like this:
{
    "DC_LOCATION": "XXXXXXXXXXXXXXXXXXXXX",
    "GROUP_NAME": "XXXXXXXXXXXXXXXXXXXXX",
    "storagacc": {
        "Accountname": "XXXXXXXXXXXXXXXXXXXXX",
        "AccountKey": "XXXXXXXXXXXXXXXXXXXXX",
        "Container": "XXXXXXXXXXXXXXXXXXXXX",
        "AZURE_CLIENT_SECRET": "XXXXXXXXXXXXXXXXXXXXX=",
        "AZURE_SUBSCRIPTION_ID": "XXXXXXX-XXXXXXX-XXXXXXX-XXXXXXX-XXXXXXX"
    }
}
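Before launching the whole scraper, a quick standalone check like the sketch below (same keys as in the file above, untested beyond my own setup) can confirm the credentials are picked up and the container is reachable:

import json
from azure.storage.blob import BlockBlobService

# Standalone sanity check (sketch): connect with the credentials above and
# list the blobs already present in the container.
with open("credentials.json", 'r') as f:
    creds = json.loads(f.read())["storagacc"]

svc = BlockBlobService(account_name=creds["Accountname"], account_key=creds["AccountKey"])
for blob in svc.list_blobs(creds["Container"]):
    print(blob.name)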