Initial commit of the Python script for scraping
This commit is contained in:
commit
b6ec04072a
|
@ -0,0 +1,36 @@
|
|||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import pandas as pd
|
||||
import time
|
||||
import random
|
||||
|
||||
# Scrape item pages from caicyt-conicet.gov.ar/cientificos (ids 1001-4999)
# and save the name, affiliation and link text found on each page to a CSV.

# Parallel result lists: position i in each list comes from the same page.
nombre = []
afiliacion = []
url_investigador = []

# Seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

for item in range(1001, 5000):
    url = 'http://www.caicyt-conicet.gov.ar/cientificos/items/show/' + str(item)
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Extract all three fields BEFORE storing any of them. When an
        # element is missing, find() returns None and the attribute access
        # raises, so the whole page is skipped and the three lists keep the
        # same length (otherwise the DataFrame construction below fails).

        # Name
        nombre_temp = soup.find('div', id='area_titular').find('h1').text.strip()
        # Affiliation
        afiliacion_temp = soup.find('div', id='espInstitucionales').find('a').text
        # URL
        url_investigador_temp = soup.find('div', class_='enlace').text
    except Exception:
        # Best-effort scraping: report the failing id and move on.
        # Catching Exception instead of using a bare except lets Ctrl-C
        # (KeyboardInterrupt) and SystemExit stop the run.
        print('Error en item', item)
    else:
        nombre.append(nombre_temp)
        afiliacion.append(afiliacion_temp)
        url_investigador.append(url_investigador_temp)

        # Progress tracking
        print(item)

        # Random pause of less than one second after each successful page.
        time.sleep(random.random())

# One row per successfully scraped page; the 1-based index is not written
# to the CSV because index=False.
df = pd.DataFrame(
    {'Nombre': nombre, 'Afiliacion': afiliacion, 'URL': url_investigador},
    index=list(range(1, len(nombre) + 1)),
)
df.to_csv('db_conicet_temp.csv', index=False)
|
Loading…
Reference in New Issue