Initial commit of the Python script for scraping

2021-08-09 18:25:29 -05:00 · 2021-08-09 18:25:29 -05:00 · b6ec04072a
commit b6ec04072a
1 changed files with 36 additions and 0 deletions
--- a/scrap_conicet.py
+++ b/scrap_conicet.py
@ -0,0 +1,36 @@
+from bs4 import BeautifulSoup
+import requests
+import pandas as pd
+import time
+import random
+
+nombre=[]
+afiliacion=[]
+url_investigador=[]
+
+for item in range(1001,5000):
+    url = 'http://www.caicyt-conicet.gov.ar/cientificos/items/show/'+str(item)
+    try:
+        page = requests.get(url)
+        soup = BeautifulSoup(page.content, 'html.parser')
+
+        #Nombre
+        nombre_temp = soup.find('div', id='area_titular').find('h1').text.strip()
+        nombre.append(nombre_temp)
+        #Afiliacion
+        afiliacion_temp = soup.find('div', id='espInstitucionales').find('a').text
+        afiliacion.append(afiliacion_temp)
+        #URL
+        url_investigador_temp = soup.find('div', class_='enlace').text
+        url_investigador.append(url_investigador_temp)
+
+        #Progress tracking
+        print (item)
+
+        time.sleep(random.random())
+
+    except:
+        print('Error en item', item)
+
+df = pd.DataFrame({'Nombre': nombre, 'Afiliacion': afiliacion, 'URL': url_investigador}, index=list(range(1,len(nombre)+1)))
+df.to_csv('db_conicet_temp.csv', index=False)