A tutorial on scraping Google Scholar Profile and Author results to a CSV file with Python and the SerpApi web scraping library.
🔨What is required: an understanding of loops, data structures, and exception handling, plus the serpapi, pandas, and urllib libraries.
⏱️How long will it take: ~15-30 minutes to read and implement.
- What will be scraped
- Prerequisites
- Process
- Profile results
- Author results
- All author articles results
- Save to CSV
- Full Code
- Links
- Outro
What will be scraped
Prerequisites
Separate virtual environment
In short, it is a thing that creates an independent set of installed libraries, including different Python versions, that can coexist with each other on the same system, which prevents library or Python version conflicts.
If you have never worked with a virtual environment before, have a look at my dedicated Python Virtual Environments Tutorial using Virtualenv and Poetry blog post to get familiar with it.
Install libraries:
pip install pandas google-search-results
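All of the scripts below read the SerpApi API key from the API_KEY environment variable via os.getenv("API_KEY"). As a small, hypothetical sanity check (not part of the original code), you could verify it is set before running anything:

import os

# make sure the SerpApi API key is available before running the scrapers
api_key = os.getenv("API_KEY")
if not api_key:
    raise SystemExit("Set the API_KEY environment variable to your SerpApi API key first.")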
Process
If you don't need an explanation:
Scrape all Google Scholar Profile results
import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd


def profile_results():
    print("Extracting profile results..")

    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_profiles",  # profile results search engine
        "mauthors": "blizzard",               # search query
    }
    search = GoogleSearch(params)

    profile_results_data = []

    profiles_is_present = True
    while profiles_is_present:
        profile_results = search.get_dict()

        for profile in profile_results.get("profiles", []):
            print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')

            thumbnail = profile.get("thumbnail")
            name = profile.get("name")
            link = profile.get("link")
            author_id = profile.get("author_id")
            affiliations = profile.get("affiliations")
            email = profile.get("email")
            cited_by = profile.get("cited_by")
            interests = profile.get("interests")

            profile_results_data.append({
                "thumbnail": thumbnail,
                "name": name,
                "link": link,
                "author_id": author_id,
                "email": email,
                "affiliations": affiliations,
                "cited_by": cited_by,
                "interests": interests
            })

        if "next" in profile_results.get("pagination", {}):
            search.params_dict.update(dict(parse_qsl(urlsplit(profile_results["pagination"]["next"]).query)))
        else:
            profiles_is_present = False

    return profile_results_data
Scraping all Profile results explanation
Import libraries:
import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
Pass search parameters to SerpApi and create a temporary list():
params = {
    "api_key": os.getenv("API_KEY"),      # SerpApi API key
    "engine": "google_scholar_profiles",  # profile results search engine
    "mauthors": "blizzard",               # search query
}
search = GoogleSearch(params)
profile_results_data = []
Set up a while loop and add an if statement to exit the while loop when there are no more pages left:
profiles_is_present = True
while profiles_is_present:
    profile_results = search.get_dict()

    # for loop extraction here..

    # if the next page is present in SerpApi pagination -> update params to the new page results.
    # if there is no next page -> exit the while loop.
    if "next" in profile_results.get("pagination", {}):
        search.params_dict.update(dict(parse_qsl(urlsplit(profile_results.get("pagination").get("next")).query)))
    else:
        profiles_is_present = False
Iterate over profile results in a for loop:
for profile in profile_results.get("profiles", []):
    print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')
thumbnail = profile.get("thumbnail")
name = profile.get("name")
link = profile.get("link")
author_id = profile.get("author_id")
affiliations = profile.get("affiliations")
email = profile.get("email")
cited_by = profile.get("cited_by")
interests = profile.get("interests")
Append the extracted data to the temporary list as a dictionary and return it:
profile_results_data.append({
    "thumbnail": thumbnail,
    "name": name,
    "link": link,
    "author_id": author_id,
    "email": email,
    "affiliations": affiliations,
    "cited_by": cited_by,
    "interests": interests
})
return profile_results_data
# example output:
'''
Extracting profile results..
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
... other profiles
[
  {
    "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=_xwYD2sAAAAJ&citpid=3",
    "name": "Adam Lobel",
    "link": "https://scholar.google.com/citations?hl=en&user=_xwYD2sAAAAJ",
    "author_id": "_xwYD2sAAAAJ",
    "email": "Verified email at AdamLobel.com",
    "affiliations": "Blizzard Entertainment",
    "cited_by": 2935,
    "interests": [
      {
        "title": "Gaming",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:gaming"
      },
      {
        "title": "Emotion regulation",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:emotion_regulation"
      }
    ]
  },
  ... other profiles
]
'''
Scrape Google Scholar Author results
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd


def author_results():
    print("extracting author results..")

    author_results_data = []

    for author_id in profile_results():
        print(f"Parsing {author_id['author_id']} author ID.")

        params = {
            "api_key": os.getenv("API_KEY"),      # SerpApi API key
            "engine": "google_scholar_author",    # author results search engine
            "author_id": author_id["author_id"],  # search query
            "hl": "en"
        }
        search = GoogleSearch(params)
        results = search.get_dict()

        thumbnail = results.get("author").get("thumbnail")
        name = results.get("author").get("name")
        affiliations = results.get("author").get("affiliations")
        email = results.get("author").get("email")
        website = results.get("author").get("website")
        interests = results.get("author").get("interests")

        cited_by_table = results.get("cited_by", {}).get("table")
        cited_by_graph = results.get("cited_by", {}).get("graph")

        public_access_link = results.get("public_access", {}).get("link")
        available_public_access = results.get("public_access", {}).get("available")
        not_available_public_access = results.get("public_access", {}).get("not_available")
        co_authors = results.get("co_authors")

        author_results_data.append({
            "thumbnail": thumbnail,
            "name": name,
            "affiliations": affiliations,
            "email": email,
            "website": website,
            "interests": interests,
            "cited_by_table": cited_by_table,
            "cited_by_graph": cited_by_graph,
            "public_access_link": public_access_link,
            "available_public_access": available_public_access,
            "not_available_public_access": not_available_public_access,
            "co_authors": co_authors
        })

    return author_results_data
Scraping Author results explanation
Import the profile_results() function and other libraries:
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
profile_results() loops through all available pages and returns a list of dictionaries, each containing an author ID result such as _xwYD2sAAAAJ.
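As a quick, hypothetical usage sketch (it assumes the API_KEY environment variable is set):

profiles = profile_results()
print(len(profiles))             # number of profiles scraped across all pages
print(profiles[0]["author_id"])  # e.g. "_xwYD2sAAAAJ"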
Create a temporary list to store the extracted data:
author_results_data = []
Iterate over the extracted profiles and pass each author_id to the search parameters:
for author_id in profile_results():
    print(f"Parsing {author_id['author_id']} author ID.")

    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_author",    # author results search engine
        "author_id": author_id["author_id"],  # search query: _xwYD2sAAAAJ
        "hl": "en"
    }
    search = GoogleSearch(params)
    results = search.get_dict()
Extract the data:
thumbnail = results.get("author").get("thumbnail")
name = results.get("author").get("name")
affiliations = results.get("author").get("affiliations")
email = results.get("author").get("email")
website = results.get("author").get("website")
interests = results.get("author").get("interests")

cited_by_table = results.get("cited_by", {}).get("table")
cited_by_graph = results.get("cited_by", {}).get("graph")

public_access_link = results.get("public_access", {}).get("link")
available_public_access = results.get("public_access", {}).get("available")
not_available_public_access = results.get("public_access", {}).get("not_available")
co_authors = results.get("co_authors")
Append the extracted data to the temporary list as a dictionary and return it:
author_results_data.append({
    "thumbnail": thumbnail,
    "name": name,
    "affiliations": affiliations,
    "email": email,
    "website": website,
    "interests": interests,
    "cited_by_table": cited_by_table,
    "cited_by_graph": cited_by_graph,
    "public_access_link": public_access_link,
    "available_public_access": available_public_access,
    "not_available_public_access": not_available_public_access,
    "co_authors": co_authors
})

return author_results_data
# example output:
'''
extracting author results..
Extracting profile results..
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
... other authors
Parsing _xwYD2sAAAAJ author ID.
... other authors
[
  {
    "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=view_photo&user=_xwYD2sAAAAJ&citpid=3",
    "name": "Adam Lobel",
    "affiliations": "Blizzard Entertainment",
    "email": "Verified email at AdamLobel.com",
    "website": "https://twitter.com/GrowingUpGaming",
    "interests": [
      {
        "title": "Gaming",
        "link": "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:gaming",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming"
      },
      {
        "title": "Emotion regulation",
        "link": "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:emotion_regulation",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation"
      }
    ],
    "cited_by_table": [
      {"citations": {"all": 2935, "since_2017": 2348}},
      {"h_index": {"all": 10, "since_2017": 10}},
      {"i10_index": {"all": 11, "since_2017": 10}}
    ],
    "cited_by_graph": [
      {"year": 2014, "citations": 70},
      {"year": 2015, "citations": 188},
      {"year": 2016, "citations": 243},
      {"year": 2017, "citations": 342},
      {"year": 2018, "citations": 420},
      {"year": 2019, "citations": 553},
      {"year": 2020, "citations": 507},
      {"year": 2021, "citations": 504},
      {"year": 2022, "citations": 16}
    ],
    "public_access_link": "https://scholar.google.com/citations?view_op=list_mandates&hl=en&user=_xwYD2sAAAAJ",
    "available_public_access": 1,
    "not_available_public_access": 0,
    "co_authors": [
      {
        "name": "Isabela Granic",
        "link": "https://scholar.google.com/citations?user=4T5cjVIAAAAJ&hl=en",
        "serpapi_link": "https://serpapi.com/search.json?author_id=4T5cjVIAAAAJ&engine=google_scholar_author&hl=en",
        "author_id": "4T5cjVIAAAAJ",
        "affiliations": "Radboud University Nijmegen",
        "email": "Verified email at pwo.ru.nl",
        "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=4T5cjVIAAAAJ&citpid=4"
      },
      ... other co-authors
    ]
  }
  ... other authors
]
'''
Scrape all Author articles from Google Scholar
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd


def all_author_articles():
    author_article_results_data = []

    for index, author_id in enumerate(profile_results(), start=1):
        print(f"Parsing {index} author with {author_id['author_id']} author ID.")

        params = {
            "api_key": os.getenv("API_KEY"),      # SerpApi API key
            "engine": "google_scholar_author",    # author results search engine
            "hl": "en",                           # language
            "sort": "pubdate",                    # sort by year
            "author_id": author_id["author_id"]   # search query
        }
        search = GoogleSearch(params)

        articles_is_present = True
        while articles_is_present:
            results = search.get_dict()

            for article in results.get("articles", []):
                title = article.get("title")
                link = article.get("link")
                citation_id = article.get("citation_id")
                authors = article.get("authors")
                publication = article.get("publication")
                cited_by_value = article.get("cited_by", {}).get("value")
                cited_by_link = article.get("cited_by", {}).get("link")
                cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
                year = article.get("year")

                author_article_results_data.append({
                    "article_title": title,
                    "article_link": link,
                    "article_year": year,
                    "article_citation_id": citation_id,
                    "article_authors": authors,
                    "article_publication": publication,
                    "article_cited_by_value": cited_by_value,
                    "article_cited_by_link": cited_by_link,
                    "article_cited_by_cites_id": cited_by_cites_id,
                })

            if "next" in results.get("serpapi_pagination", {}):
                search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
            else:
                articles_is_present = False

    return author_article_results_data
Scraping all Author articles explanation
Import the profile_results() function and other libraries:
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
In this case, profile_results() is also used to get the author_id needed to parse author articles.
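For illustration, a minimal (hypothetical) sketch that collects only the author IDs from the scraped profiles:

# illustrative sketch: gather just the author IDs returned by profile_results()
author_ids = [profile["author_id"] for profile in profile_results()]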
Create a temporary list to store the extracted data:
author_article_results_data = []
Iterate over profile_results() and pass each author_id to the search parameters:
for index, author_id in enumerate(profile_results(), start=1):
    print(f"Parsing {index} author with {author_id['author_id']} author ID.")

    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_author",    # author results search engine
        "hl": "en",                           # language
        "sort": "pubdate",                    # sort by year
        "author_id": author_id["author_id"]   # search query
    }
    search = GoogleSearch(params)
Set up a while loop and check whether a next page is present:
articles_is_present = True
while articles_is_present:
    results = search.get_dict()

    # data extraction code..

    # if next page is present -> update previous results to the new page results.
    # if next page is not present -> exit the while loop.
    if "next" in results.get("serpapi_pagination", {}):
        search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
    else:
        articles_is_present = False
Extract the data in a for loop:
for article in results.get("articles", []):
    title = article.get("title")
    link = article.get("link")
    citation_id = article.get("citation_id")
    authors = article.get("authors")
    publication = article.get("publication")
    cited_by_value = article.get("cited_by", {}).get("value")
    cited_by_link = article.get("cited_by", {}).get("link")
    cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
    year = article.get("year")
Append the extracted data to the temporary list as a dictionary:
author_article_results_data.append({
    "article_title": title,
    "article_link": link,
    "article_year": year,
    "article_citation_id": citation_id,
    "article_authors": authors,
    "article_publication": publication,
    "article_cited_by_value": cited_by_value,
    "article_cited_by_link": cited_by_link,
    "article_cited_by_cites_id": cited_by_cites_id,
})
Return the extracted data:
return author_article_results_data
Save Google Scholar Profile and Author results to CSV
from google_scholar_profile_results import profile_results
import pandas as pd
def save_profile_results_to_csv():
    print("Waiting for profile results to save..")
    pd.DataFrame(data=profile_results()).to_csv("google_scholar_profile_results.csv", encoding="utf-8", index=False)
    print("Profile Results Saved.")


def save_author_result_to_csv():
    print("Waiting for author results to save..")
    # assumes author_results() is imported from the author results script above
    pd.DataFrame(data=author_results()).to_csv("google_scholar_author_results.csv", encoding="utf-8", index=False)
    print("Author Results Saved.")


def save_author_articles_to_csv():
    print("Waiting for author articles to save..")
    # assumes all_author_articles() is imported from the author articles script above
    pd.DataFrame(data=all_author_articles()).to_csv("google_scholar_author_articles.csv", encoding="utf-8", index=False)
    print("Author Articles Saved.")
- The data argument inside DataFrame is your data. encoding='utf-8' just makes sure everything will be saved correctly. I used it explicitly, even though it is the default value.
- The index=False argument removes default pandas row numbers.
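As a quick illustration, a hypothetical entry point (not part of the original scripts) that runs all three savers in sequence:

# hypothetical entry point: run the three savers one after another
if __name__ == "__main__":
    save_profile_results_to_csv()
    save_author_result_to_csv()
    save_author_articles_to_csv()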
Full Code
import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd


def profile_results():
    print("Extracting profile results..")

    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_profiles",  # profile results search engine
        "mauthors": "blizzard",               # search query
    }
    search = GoogleSearch(params)

    profile_results_data = []

    profiles_is_present = True
    while profiles_is_present:
        profile_results = search.get_dict()

        for profile in profile_results.get("profiles", []):
            print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')

            thumbnail = profile.get("thumbnail")
            name = profile.get("name")
            link = profile.get("link")
            author_id = profile.get("author_id")
            affiliations = profile.get("affiliations")
            email = profile.get("email")
            cited_by = profile.get("cited_by")
            interests = profile.get("interests")

            profile_results_data.append({
                "thumbnail": thumbnail,
                "name": name,
                "link": link,
                "author_id": author_id,
                "email": email,
                "affiliations": affiliations,
                "cited_by": cited_by,
                "interests": interests
            })

        if "next" in profile_results.get("pagination", {}):
            search.params_dict.update(dict(parse_qsl(urlsplit(profile_results.get("pagination").get("next")).query)))
        else:
            profiles_is_present = False

    return profile_results_data


def author_results():
    print("extracting author results..")

    author_results_data = []

    for author_id in profile_results():
        print(f"Parsing {author_id['author_id']} author ID.")

        params = {
            "api_key": os.getenv("API_KEY"),      # SerpApi API key
            "engine": "google_scholar_author",    # author results search engine
            "author_id": author_id["author_id"],  # search query
            "hl": "en"
        }
        search = GoogleSearch(params)
        results = search.get_dict()

        thumbnail = results.get("author").get("thumbnail")
        name = results.get("author").get("name")
        affiliations = results.get("author").get("affiliations")
        email = results.get("author").get("email")
        website = results.get("author").get("website")
        interests = results.get("author").get("interests")

        cited_by_table = results.get("cited_by", {}).get("table")
        cited_by_graph = results.get("cited_by", {}).get("graph")

        public_access_link = results.get("public_access", {}).get("link")
        available_public_access = results.get("public_access", {}).get("available")
        not_available_public_access = results.get("public_access", {}).get("not_available")
        co_authors = results.get("co_authors")

        author_results_data.append({
            "thumbnail": thumbnail,
            "name": name,
            "affiliations": affiliations,
            "email": email,
            "website": website,
            "interests": interests,
            "cited_by_table": cited_by_table,
            "cited_by_graph": cited_by_graph,
            "public_access_link": public_access_link,
            "available_public_access": available_public_access,
            "not_available_public_access": not_available_public_access,
            "co_authors": co_authors
        })

    return author_results_data


def all_author_articles():
    author_article_results_data = []

    for index, author_id in enumerate(profile_results(), start=1):
        print(f"Parsing author #{index} with {author_id['author_id']} author ID.")

        params = {
            "api_key": os.getenv("API_KEY"),      # SerpApi API key
            "engine": "google_scholar_author",    # author results search engine
            "hl": "en",                           # language
            "sort": "pubdate",                    # sort by year
            "author_id": author_id["author_id"]   # search query
        }
        search = GoogleSearch(params)

        articles_is_present = True
        while articles_is_present:
            results = search.get_dict()

            for article in results.get("articles", []):
                title = article.get("title")
                link = article.get("link")
                citation_id = article.get("citation_id")
                authors = article.get("authors")
                publication = article.get("publication")
                cited_by_value = article.get("cited_by", {}).get("value")
                cited_by_link = article.get("cited_by", {}).get("link")
                cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
                year = article.get("year")

                author_article_results_data.append({
                    "article_title": title,
                    "article_link": link,
                    "article_year": year,
                    "article_citation_id": citation_id,
                    "article_authors": authors,
                    "article_publication": publication,
                    "article_cited_by_value": cited_by_value,
                    "article_cited_by_link": cited_by_link,
                    "article_cited_by_cites_id": cited_by_cites_id,
                })

            if "next" in results.get("serpapi_pagination", {}):
                search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
            else:
                articles_is_present = False

    return author_article_results_data


def save_author_result_to_csv():
    print("Waiting for author results to save..")
    pd.DataFrame(data=author_results()).to_csv("google_scholar_author_results.csv", encoding="utf-8", index=False)
    print("Author Results Saved.")


def save_author_articles_to_csv():
    print("Waiting for author articles to save..")
    pd.DataFrame(data=all_author_articles()).to_csv("google_scholar_author_articles.csv", encoding="utf-8", index=False)
    print("Author Articles Saved.")


def save_profile_results_to_csv():
    print("Waiting for profile results to save..")
    pd.DataFrame(data=profile_results()).to_csv("google_scholar_profile_results.csv", encoding="utf-8", index=False)
    print("Profile Results Saved.")
Links
Outro
If your goal is to extract data without having to write a parser from scratch, figure out how to bypass search engine blocks, how to scale it, or how to extract data from JavaScript, give SerpApi a try.
If you have anything to share, any questions, suggestions, or something that isn't working correctly, reach out via Twitter at @dimitryzub or @serp_api.
Join us on Reddit | Twitter | YouTube
Add a Feature Request💫 or a Bug🐞