67 lines
1.3 KiB
Python
67 lines
1.3 KiB
Python
import pprint
import urllib.parse

import pyquery
import requests
|
|
|
|
|
|
def _fetch_html(url, timeout=None):
    """Download *url* and return the response body decoded as UTF-8."""
    # The context manager releases the connection once the body is read.
    with requests.get(url, timeout=timeout) as response:
        return response.content.decode('utf-8')


def _extract_comments(comment_nodes):
    """Return one ``dict(author=..., text=...)`` per comment element."""
    comments = []
    for node in comment_nodes:
        comment = pyquery.PyQuery(node)
        comments.append(
            dict(
                # ``attr('href')`` yields None when no header link matches.
                author=comment('.comment__header > a').attr('href'),
                text=comment('.comment__body').text(),
            )
        )
    return comments


def kernel_1_sample_scrap(
    max_articles=None,
    timeout=None,
):
    """Scrape article links from the dev.to front page and their comments.

    The front page is fetched, the ``href`` of every
    ``.crayons-story__title > a`` element is collected, and the first
    ``max_articles`` linked pages are downloaded. For each page, every
    ``.comment__content`` element is turned into an ``author``/``text``
    record. Intermediate results are pretty-printed to stdout.

    Args:
        max_articles: Upper bound of the slice taken from the list of
            article URLs. ``None`` means 1.
        timeout: Forwarded to ``requests.get`` for every request. ``None``
            (the default) means no timeout.

    Returns:
        A dict with the keys:
            ``t1``: HTML text of the last article fetched, or ``None`` if
                no article was fetched.
            ``t2``: ``PyQuery`` document built from ``t1``, or ``None``.
            ``t3``: the ``.comment__content`` selection of the last
                article, or ``None``.
            ``t6``: comment records of the last article (empty list if no
                article was fetched).
            ``t8``: list of ``dict(article=url, comments=[...])``, one per
                fetched article.
            ``t12``: raw ``href`` values found on the front page.
    """
    if max_articles is None:
        max_articles = 1

    base_url = 'https://dev.to'

    front_page = pyquery.PyQuery(_fetch_html(base_url, timeout=timeout))
    hrefs = [
        pyquery.PyQuery(anchor).attr('href')
        for anchor in front_page('.crayons-story__title > a')
    ]
    pprint.pprint(hrefs)

    # urljoin copes with root-relative, relative and absolute hrefs alike,
    # avoiding the "https://dev.to//path" produced by plain concatenation.
    # Anchors without an href are skipped instead of becoming ".../None".
    article_urls = [
        urllib.parse.urljoin(base_url + '/', href)
        for href in hrefs
        if href
    ]

    # Pre-bind the "last article" values so the return statement is valid
    # even when the loop below never runs (no links, or max_articles == 0).
    last_html = None
    last_document = None
    last_comment_nodes = None
    last_comments = []

    articles = []
    for article_url in article_urls[:max_articles]:
        last_html = _fetch_html(article_url, timeout=timeout)
        last_document = pyquery.PyQuery(last_html)
        last_comment_nodes = last_document('.comment__content')
        last_comments = _extract_comments(last_comment_nodes)

        pprint.pprint(last_comments)
        articles.append(
            dict(
                article=article_url,
                comments=last_comments,
            )
        )

    pprint.pprint(articles)

    return dict(
        t1=last_html,
        t2=last_document,
        t3=last_comment_nodes,
        t6=last_comments,
        t8=articles,
        t12=hrefs,
    )
|