37 lines
1.0 KiB
Python
37 lines
1.0 KiB
Python
from bs4 import BeautifulSoup
|
|
import requests
|
|
import pandas as pd
|
|
import time
|
|
import random
|
|
|
|
# Scrape researcher records from the CAICYT-CONICET "cientificos" catalogue.
#
# For every item id in the range below the item page is downloaded and three
# fields are read from it: the researcher's name, the affiliation and the text
# of the "enlace" (link) block. Successfully parsed records are accumulated in
# three parallel lists and written to 'db_conicet_temp.csv' at the end.

BASE_URL = 'http://www.caicyt-conicet.gov.ar/cientificos/items/show/'

# Parallel result lists: position i of each list belongs to the same record.
nombre = []
afiliacion = []
url_investigador = []

for item in range(1001, 5000):
    url = BASE_URL + str(item)

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Parse all three fields BEFORE storing any of them. soup.find()
        # returns None for a missing element, so an incomplete page raises
        # AttributeError here; doing the appends afterwards guarantees the
        # three lists always keep the same length.

        # Name
        nombre_temp = soup.find('div', id='area_titular').find('h1').text.strip()
        # Affiliation
        afiliacion_temp = soup.find('div', id='espInstitucionales').find('a').text
        # URL
        url_investigador_temp = soup.find('div', class_='enlace').text
    except Exception:
        # Best-effort scraping: skip any item that cannot be fetched or
        # parsed. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit through so the run can be stopped.
        print('Error en item', item)
        continue

    nombre.append(nombre_temp)
    afiliacion.append(afiliacion_temp)
    url_investigador.append(url_investigador_temp)

    # Progress tracking
    print(item)

    # Random pause (0-1 s) between successful requests to avoid hammering
    # the server.
    time.sleep(random.random())

# The frame is indexed from 1, but the index is not written to the CSV.
df = pd.DataFrame(
    {'Nombre': nombre, 'Afiliacion': afiliacion, 'URL': url_investigador},
    index=range(1, len(nombre) + 1),
)
df.to_csv('db_conicet_temp.csv', index=False)
|