Initial commit of the Python script for scraping

This commit is contained in:
Albert Strusberg 2021-08-09 18:25:29 -05:00
commit b6ec04072a
1 changed files with 36 additions and 0 deletions

36
scrap_conicet.py Normal file
View File

@ -0,0 +1,36 @@
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import random
# Scrape researcher name, affiliation and link text from the CAICYT-CONICET
# "cientificos" item pages (one page per numeric item id) and dump the
# collected rows to a CSV file.
BASE_URL = 'http://www.caicyt-conicet.gov.ar/cientificos/items/show/'
# Seconds to wait on a single request; without a timeout a stalled server
# would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

nombre = []
afiliacion = []
url_investigador = []

for item in range(1001, 5000):
    url = BASE_URL + str(item)
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.content, 'html.parser')
        # Extract every field BEFORE touching the result lists: if any lookup
        # fails, nothing is appended, so the three lists always stay the same
        # length and the DataFrame below can be built.
        nombre_temp = soup.find('div', id='area_titular').find('h1').text.strip()
        afiliacion_temp = soup.find('div', id='espInstitucionales').find('a').text
        url_investigador_temp = soup.find('div', class_='enlace').text
    except (requests.RequestException, AttributeError):
        # RequestException: network failure or timeout.
        # AttributeError: an expected element is missing (find() returned None).
        # Anything else (including Ctrl-C) is allowed to propagate.
        print('Error en item', item)
    else:
        nombre.append(nombre_temp)
        afiliacion.append(afiliacion_temp)
        url_investigador.append(url_investigador_temp)
        # Progress tracking
        print(item)
        # Random sub-second pause between successful requests.
        time.sleep(random.random())

# Rows are labelled 1..N in memory; the index is not written to the CSV.
df = pd.DataFrame(
    {'Nombre': nombre, 'Afiliacion': afiliacion, 'URL': url_investigador},
    index=range(1, len(nombre) + 1),
)
df.to_csv('db_conicet_temp.csv', index=False)