# Recovered from: commit b6ec04072a67dc511e2429adc71cc34b95167196
# Author: Sundering
# Date:   Mon Aug 9 18:25:29 2021 -0500
# Message: Initial commit of the python script for scrapping
# File:   scrap_conicet.py
"""Scrape researcher profile pages and export them to a CSV file.

For every item id in ``range(FIRST_ITEM, LAST_ITEM)`` the script downloads
``BASE_URL + str(item)``, extracts the researcher's name, affiliation and
link text from the page, and finally writes all collected rows to
``OUTPUT_CSV`` with the columns ``Nombre``, ``Afiliacion`` and ``URL``.
Items that cannot be downloaded or parsed are reported and skipped.
"""
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import random

BASE_URL = 'http://www.caicyt-conicet.gov.ar/cientificos/items/show/'
FIRST_ITEM = 1001  # first item id requested (inclusive)
LAST_ITEM = 5000   # end of the id range (exclusive)
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection hangs forever
OUTPUT_CSV = 'db_conicet_temp.csv'


def _scrape_item(item):
    """Download one item page and return ``(nombre, afiliacion, url)``.

    Args:
        item: Numeric item id appended to ``BASE_URL``.

    Returns:
        A 3-tuple of strings: the stripped text of the ``<h1>`` inside
        ``div#area_titular``, the text of the first ``<a>`` inside
        ``div#espInstitucionales``, and the text of the first
        ``div.enlace``.

    Raises:
        requests.RequestException: The HTTP request failed or timed out.
        AttributeError: An expected element is missing from the page
            (``find`` returned ``None``).
    """
    page = requests.get(BASE_URL + str(item), timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Name
    nombre_temp = soup.find('div', id='area_titular').find('h1').text.strip()
    # Affiliation
    afiliacion_temp = soup.find('div', id='espInstitucionales').find('a').text
    # URL
    url_investigador_temp = soup.find('div', class_='enlace').text

    return nombre_temp, afiliacion_temp, url_investigador_temp


nombre = []
afiliacion = []
url_investigador = []

for item in range(FIRST_ITEM, LAST_ITEM):
    try:
        registro = _scrape_item(item)
    except (requests.RequestException, AttributeError):
        # Network failure or a page without the expected layout: skip it.
        # Other exceptions (e.g. KeyboardInterrupt) are left to propagate.
        print('Error en item', item)
    else:
        # Append only after ALL three fields were extracted, so the lists
        # always have equal length and the DataFrame below can be built.
        nombre.append(registro[0])
        afiliacion.append(registro[1])
        url_investigador.append(registro[2])

        # Progress tracking
        print(item)

    # Random pause (< 1 s) after every request, including failed ones, so
    # runs of missing items do not hit the server back-to-back.
    time.sleep(random.random())

df = pd.DataFrame(
    {'Nombre': nombre, 'Afiliacion': afiliacion, 'URL': url_investigador},
    index=list(range(1, len(nombre) + 1)),
)
df.to_csv(OUTPUT_CSV, index=False)