# Provenance: extracted from git patch b6ec04072a67dc511e2429adc71cc34b95167196
# (author "Sundering", Mon, 9 Aug 2021 18:25:29 -0500,
# subject "Initial commit of the python script for scrapping"),
# which created this file as scrap_conicet.py.
"""Scrape researcher records from the CAICYT-CONICET "cientificos" site.

For every item id in ``range(FIRST_ITEM, LAST_ITEM)`` the script downloads
``BASE_URL + str(item)``, extracts the researcher's name, affiliation and
link text from the page, and finally writes all successfully parsed records
to ``OUTPUT_CSV`` with the columns ``Nombre``, ``Afiliacion`` and ``URL``.

Items that cannot be fetched or parsed are reported on stdout and skipped.
"""
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.caicyt-conicet.gov.ar/cientificos/items/show/'
FIRST_ITEM = 1001
LAST_ITEM = 5000  # exclusive upper bound, as in range()
OUTPUT_CSV = 'db_conicet_temp.csv'
COLUMNS = ['Nombre', 'Afiliacion', 'URL']
# Seconds to wait for the server before giving up on a single item, so one
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30


def _parse_investigador(soup):
    """Return ``(nombre, afiliacion, url_investigador)`` from a parsed page.

    Raises ``AttributeError`` when any of the expected elements is missing,
    because ``soup.find`` returns ``None`` in that case.
    """
    nombre = soup.find('div', id='area_titular').find('h1').text.strip()
    afiliacion = soup.find('div', id='espInstitucionales').find('a').text
    url_investigador = soup.find('div', class_='enlace').text
    return nombre, afiliacion, url_investigador


def _fetch_investigador(item):
    """Download the page for *item* and return its parsed record tuple."""
    page = requests.get(BASE_URL + str(item), timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.content, 'html.parser')
    return _parse_investigador(soup)


def scrape(items):
    """Scrape every id in *items* and return the list of complete records.

    Each record is appended as a single tuple only after all three fields
    were extracted, so a page that is missing one field never leaves the
    columns with different lengths.
    """
    rows = []
    for item in items:
        try:
            rows.append(_fetch_investigador(item))
        except Exception as exc:
            # Deliberate best-effort: one bad page must not abort a run of
            # thousands of requests. Unlike a bare ``except``, this still
            # lets KeyboardInterrupt / SystemExit stop the script.
            print('Error en item', item, repr(exc))
        else:
            # Progress tracking
            print(item)
        # Throttle after every attempt (success or failure) so that a run of
        # failing items does not hit the server back-to-back.
        time.sleep(random.random())
    return rows


def main():
    """Run the full scrape and write the results to ``OUTPUT_CSV``."""
    rows = scrape(range(FIRST_ITEM, LAST_ITEM))
    df = pd.DataFrame(rows, columns=COLUMNS)
    # The index is not written, so no explicit index is built for the frame.
    df.to_csv(OUTPUT_CSV, index=False)


if __name__ == '__main__':
    main()