During this summer vacation, I worked as an intern engineer at an internet service company. The company runs a price comparison platform for online shops. Here is the website link: https://biggo.com.tw/ . They crawl product information from different e-shops such as Amazon, PChome, etc., and store the data in a SQL system. Then they run a search engine website that presents the price comparison results. Its service area includes Taiwan, Thailand, and Singapore. Now they want to expand their business into Japan, so most of my job is writing crawlers for Japanese e-shop websites.

So I want to write an article to record the crawler code. The company always stores its data in a format called "pickle", and we use Python for the crawling jobs. Most of the work runs inside the crawler itself, but we keep the request logic in a separate .py file (it must be named requesturl.py, because the crawler imports it with `from requesturl import requesturl`). So we need to put the two files in the same directory and then run crawler.py to execute the job.
1. The Crawler
Import modules
import urllib.parse
import os
import re
import argparse
import logging
import time
import pickle
from bs4 import BeautifulSoup
from requesturl import requesturl
To put it briefly, our job is to find the category pages, then use each category branch to crawl its merchandise information, and finally write code that automatically jumps to the next page so that we collect all of the products under that branch.
logger = logging.getLogger('avac')


class Crawler:
    """Crawl product listings from www.avac.co.jp and dump them as pickle files.

    Workflow (see ``start``): collect the category URLs from the home page,
    walk every category page by page (``loopurl`` / ``geturl``), de-duplicate
    the products by their ``iid`` and periodically flush the buffered items
    to ``./var/item_buf/<site>.<timestamp>.pickle`` (``syncbuf``).

    Each collected item is a dict with the keys ``iid``, ``title``, ``url``,
    ``image``, ``provide``, ``providename`` and ``price``.
    """

    # Site identifier; used in the dump file name and stored inside each dump.
    site = 'jp_ec_avac'
    # Scheme + host that every relative link on the site is joined to.
    host = 'https://www.avac.co.jp'
    # Only the first `cat_limit` sidebar entries are treated as categories
    # (value carried over from the original hard-coded slice).
    cat_limit = 48
    # The item buffer is flushed to disk once it holds more items than this.
    flush_threshold = 1000

    def __init__(self, pilimit=1000):
        """Create a crawler.

        Args:
            pilimit: upper bound for the page counter used by ``loopurl``;
                the counter starts at 1 and the loop runs while it is
                smaller than ``pilimit``.
        """
        self.pilimit = pilimit
        self.keyset = set()      # iids already seen (de-duplication)
        self.item_buf = []       # items waiting to be written to disk
        self.uc = requesturl()   # HTTP helper exposing reachurl()
        self.progress = []       # inserted-count of every finished category
        self.cat_total = None    # number of categories; filled in lazily

    def collect_cat(self):
        """Return the category URLs found in the home page side bar.

        Entries without an ``<a href>`` are skipped. The result is sorted so
        that the crawl order is the same on every run.
        """
        doc = self.uc.reachurl(self.host + '/')
        sp = BeautifulSoup(doc, 'lxml')
        cats = set()
        entries = sp.select('td.side_text ul.level1 li.level1')[:self.cat_limit]
        for entry in entries:
            anchor = entry.select_one('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue
            cats.add(self.host + href)
        return sorted(cats)

    def geturl(self, url):
        """Fetch one listing page and extract its products.

        Args:
            url: URL of the listing page.

        Returns:
            ``(items, nextpage)``: the list of valid item dicts found on the
            page and the URL of the following page, or ``""`` when there is
            no next page.
        """
        items = []
        doc = self.uc.reachurl(url)
        sp = BeautifulSoup(doc, 'lxml')
        nextpage = self._find_nextpage(sp)
        blocks = sp.select('div.list_area.clearfix')
        if not blocks:
            logger.warning('Wrong cant not find items')
            return items, nextpage
        for block in blocks:
            item = self._parse_item(block, url)
            if item is not None:
                items.append(item)
        return items, nextpage

    def _find_nextpage(self, sp):
        """Return the next-page URL from the pager, or "" if there is none."""
        links = sp.select('span.navi a')
        if not links:
            return ""
        last = links[-1]
        href = last.get('href')
        # The pager's last link is the "next" link only when it is labelled so.
        if href and '次へ' in last.get_text():
            return self.host + '/products/list.php' + href
        return ""

    def _parse_item(self, block, page_url):
        """Build an item dict from one product block; None if it is unusable."""
        img = block.select_one('div.listphoto a img')
        price_tag = block.select_one('span.price strong')
        name_block = block.select_one('h4.text-right a')
        if img is None:
            logger.warning('Wrong cant find img: %s', page_url)
            return None
        if price_tag is None:
            logger.warning('Wrong cant find price: %s', page_url)
            return None
        if name_block is None:
            logger.warning('Wrong cant find title or url: %s', page_url)
            return None

        title = img.get('alt')
        src = img.get('src')
        href = name_block.get('href')
        if not (title and src and href):
            logger.warning('Wrong info: %s', page_url)
            return None

        url = self.host + href
        # The product id is whatever sits between the first and second "=".
        parts = url.split('=')
        if len(parts) < 2:
            logger.warning('Wrong info: %s', url)
            return None

        item = {
            'iid': parts[1],
            'title': title,
            'url': url,
            'image': self.host + src,
            'provide': 'avac',
            'providename': ['avac'],
            'price': self.getint(price_tag.text.strip()),
        }
        valid = (
            len(item['iid']) > 1
            and len(item['title']) > 2
            and item['image'].startswith('http')
            and item['price'] > 0
            and len(item['url']) > 10
        )
        if not valid:
            logger.warning('Wrong item: %s', url)
            logger.debug('Rejected item: %r', item)
            return None
        return item

    def getint(self, strtext):
        """Extract an integer from a price-like string.

        Every character except digits and "." is dropped and the remainder
        is truncated to an int. Returns 0 when nothing numeric is left or the
        remainder is not a valid number (for example "." or "1.2.3").
        """
        intstr = re.sub(r"[^0-9.]", "", strtext)
        if not intstr:
            return 0
        try:
            return int(float(intstr))
        except ValueError:
            return 0

    def prepare_dump(self):
        """Create ``./var/item_buf`` if needed and return a new dump file path.

        The directory is relative to the current working directory; the file
        name is ``<site>.<time.time()>.pickle``.
        """
        dirname = os.path.join('.', 'var', 'item_buf')
        os.makedirs(dirname, exist_ok=True)
        ts = time.time()
        return os.path.join(dirname, '{}.{}.pickle'.format(self.site, ts))

    def syncbuf(self):
        """Pickle the buffered items to a new dump file and clear the buffer.

        Does nothing when the buffer is empty.
        """
        if not self.item_buf:
            return
        output_file = self.prepare_dump()
        to_dump = {
            'site': self.site,
            'buffer': self.item_buf,
        }
        with open(output_file, 'wb') as fd:
            pickle.dump(to_dump, fd)
        self.item_buf = []

    def spamcheck(self, item):
        """Return True the first time an ``iid`` is seen, False for repeats."""
        iid = item['iid']
        if iid in self.keyset:
            return False
        self.keyset.add(iid)
        return True

    def loopurl(self, it):
        """Crawl one category, following next-page links.

        Stops when a page yields no items, yields only already-seen items,
        has no next page, or the page counter reaches ``pilimit``.

        Args:
            it: URL of the first page of the category.

        Returns:
            Always True.
        """
        searched = 0
        inserted = 0
        searchurl = it
        pi = 1
        while pi < self.pilimit:
            newitem, searchurl = self.geturl(searchurl)
            if not newitem:
                break
            fresh = [item for item in newitem if self.spamcheck(item)]
            if not fresh:
                # A page of pure duplicates means the listing is looping.
                break
            self.item_buf.extend(fresh)
            inserted += len(fresh)
            searched += len(newitem)
            if len(self.item_buf) > self.flush_threshold:
                self.syncbuf()
            if not searchurl:
                break
            pi += 1
        logger.info('%s pi: %s, searched: %s, inserted: %s', it, pi, searched, inserted)
        # Progress is tracked on the instance, not in a module-level global.
        self.state(inserted, self.progress)
        return True

    def state(self, inserted, a):
        """Record a finished category and log overall progress.

        Args:
            inserted: number of new items found in the finished category.
            a: list of per-category counts; ``inserted`` is appended to it
                in place.

        The category total is fetched once and cached, so the home page is
        not downloaded again for every category. The log message reports the
        current total number of products and the percentage of categories
        finished (0.0 when no categories are known).
        """
        a.append(inserted)
        if self.cat_total is None:
            self.cat_total = len(self.collect_cat())
        total = self.cat_total
        finish = len(a) / total * 100 if total else 0.0
        su = sum(a)
        logger.info('目前商品總數: %s, 類別完成率: %s', su, finish)

    def start(self):
        """Crawl every category and flush whatever is left in the buffer."""
        cats = self.collect_cat()
        self.cat_total = len(cats)
        logger.info('get %s cats', len(cats))
        for cat_url in cats:
            self.loopurl(cat_url)
        self.syncbuf()
if __name__ == '__main__':
    # Command-line entry point: configure logging, then crawl every category.
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--debug", help="enable debug-level logging", action="store_true")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    # Upper bound for the per-category page counter.
    pilimit = 10000
    # Module-level list of per-category insert counts; it has to stay a
    # global for any code that looks up `a` by name while reporting progress.
    a = list()
    crawler = Crawler(pilimit)
    crawler.start()
2. The request part (requesturl.py)
import time
import requests
requests.packages.urllib3.disable_warnings()
import os
import logging
logger = logging.getLogger(__name__)


class requesturl:
    """Small HTTP helper that downloads a URL with retries over one session."""

    # Tunables (defaults are the values that used to be hard-coded).
    retries = 5        # maximum number of attempts per URL
    timeout = 180      # per-request timeout in seconds
    min_length = 4000  # a non-JSON body must be longer than this to be accepted
    retry_delay = 10   # seconds to wait after an attempt raised an exception

    def __init__(self):
        # One session is reused for every request made through this object.
        self.bot = requests.session()

    def reachurl(self, url, referer='https://www.avac.co.jp/', rjson=False):
        """Download ``url`` and return its content.

        Up to ``retries`` attempts are made. An attempt succeeds when the
        response is OK and, for non-JSON requests, the body is longer than
        ``min_length`` bytes. Exceptions are logged at debug level and
        followed by a ``retry_delay`` pause before the next attempt.

        Args:
            url: address to fetch.
            referer: value sent as the ``Referer`` header; pass an empty
                value to send none.
            rjson: when True, return the decoded JSON body instead of the
                raw bytes.

        Returns:
            The decoded JSON (``rjson=True``) or the response body as bytes.
            If no attempt succeeded, the last body received from an OK
            response (possibly shorter than ``min_length``), or ``""`` when
            there was none.
        """
        urlcontent = ""
        headers = {'X-Requested-With': 'XMLHttpRequest'}
        if referer:
            headers['Referer'] = referer
        for attempt in range(self.retries):
            try:
                # NOTE(review): verify=False disables TLS certificate checks
                # and redirects are not followed; both kept as in the
                # original behaviour.
                r = self.bot.get(url, headers=headers, timeout=self.timeout,
                                 verify=False, allow_redirects=False)
                if r.ok:
                    if rjson:
                        urlcontent = r.json()
                        break
                    urlcontent = r.content
                    if len(urlcontent) > self.min_length:
                        break
            except Exception as inst:
                # Best effort: any failure just triggers another attempt.
                logger.debug("Retry: %s %s %s", inst, attempt, url)
                time.sleep(self.retry_delay)
            logger.debug('%s\t %s %s', url, os.getpid(), len(urlcontent))
        else:
            # Runs only when no attempt ended with `break`.
            logger.debug('bot checkurl false: %s', url)
        return urlcontent