A tutorial on scraping Google Scholar Profile and Author results to a CSV file using a Python web scraping library and SerpApi.

🔨What is required: an understanding of loops, data structures, and exception handling. The serpapi, pandas, and urllib libraries.

⏱️How long will it take: ~15–30 minutes to read and implement.

What will be scraped

Prerequisites

Separate virtual environment

In short, a virtual environment creates an independent set of installed libraries, including different Python versions, that can coexist on the same system, which prevents library and Python version conflicts.

If you haven't worked with a virtual environment before, have a look at my dedicated blog post, Python Virtual Environments Tutorial using Virtualenv and Poetry, to get familiar with it.
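For example, a minimal setup using Python's built-in venv module might look like this (the environment name env is arbitrary):

python -m venv env
source env/bin/activate  # on Windows: env\Scripts\activate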

Install the libraries:

pip install pandas google-search-results
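The scripts below read your SerpApi API key from the API_KEY environment variable via os.getenv("API_KEY"), so set it before running, for example (Unix-like shells):

export API_KEY="<your SerpApi API key>"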

Process

If you don't need an explanation, skip ahead to the Full Code section.

Scrape all Google Scholar Profile results

import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
def profile_results():
    print("Extracting profile results..")
    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_profiles",  # profile results search engine
        "mauthors": "blizzard",               # search query
    }
    search = GoogleSearch(params)
    profile_results_data = []
    profiles_is_present = True
    while profiles_is_present:
        profile_results = search.get_dict()
        for profile in profile_results.get("profiles", []):
            print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')
            thumbnail = profile.get("thumbnail")
            name = profile.get("name")
            link = profile.get("link")
            author_id = profile.get("author_id")
            affiliations = profile.get("affiliations")
            email = profile.get("email")
            cited_by = profile.get("cited_by")
            interests = profile.get("interests")
            profile_results_data.append({
                "thumbnail": thumbnail,
                "name": name,
                "link": link,
                "author_id": author_id,
                "email": email,
                "affiliations": affiliations,
                "cited_by": cited_by,
                "interests": interests
            })
            if "next" in profile_results["pagination"]:
                search.params_dict.update(dict(parse_qsl(urlsplit(profile_results["pagination"]["next"]).query)))
            else:
                profiles_is_present = False
    return profile_results_data

Explanation of scraping all profile results

Import the libraries:

import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd

Pass the search parameters to SerpApi and create a temporary list():

params = {
    "api_key": os.getenv("API_KEY"),      # SerpApi API key
    "engine": "google_scholar_profiles",  # profile results search engine
    "mauthors": "blizzard",               # search query
}
search = GoogleSearch(params)
profile_results_data = []

Set up a while loop and add an if statement to exit the while loop when no pages are left:

profiles_is_present = True
while profiles_is_present:
    profile_results = search.get_dict()
    
    # for loop extraction here..
    
    # if next page is present in pagination -> update params to the new page results.
    # if no next page -> exit the while loop.
    if "next" in profile_results.get("pagination", {}):
        search.params_dict.update(dict(parse_qsl(urlsplit(profile_results.get("pagination").get("next")).query)))
    else:
        profiles_is_present = False

Iterate over the profile results in a for loop:

for profile in profile_results.get("profiles", []):
    print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')
    thumbnail = profile.get("thumbnail")
    name = profile.get("name")
    link = profile.get("link")
    author_id = profile.get("author_id")
    affiliations = profile.get("affiliations")
    email = profile.get("email")
    cited_by = profile.get("cited_by")
    interests = profile.get("interests")

Append the extracted data to the temporary list as a dictionary and return it:

profile_results_data.append({
    "thumbnail": thumbnail,
    "name": name,
    "link": link,
    "author_id": author_id,
    "email": email,
    "affiliations": affiliations,
    "cited_by": cited_by,
    "interests": interests
return profile_results_data
# example output:
'''
Extracting profile results..
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
... other profiles
[
  {
    "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=_xwYD2sAAAAJ&citpid=3",
    "name": "Adam Lobel",
    "link": "https://scholar.google.com/citations?hl=en&user=_xwYD2sAAAAJ",
    "author_id": "_xwYD2sAAAAJ",
    "email": "Verified email at AdamLobel.com",
    "affiliations": "Blizzard Entertainment",
    "cited_by": 2935,
    "interests": [
      {
        "title": "Gaming",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:gaming"
      },
      {
        "title": "Emotion regulation",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:emotion_regulation"
      }
    ]
  },
  ... other profiles
]
'''

Scrape Google Scholar Author results

import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
def author_results():
    print("extracting author results..")
    author_results_data = []
    for author_id in profile_results():
        print(f"Parsing {author_id['author_id']} author ID.")
        params = {
            "api_key": os.getenv("API_KEY"),      # SerpApi API key
            "engine": "google_scholar_author",    # author results search engine
            "author_id": author_id["author_id"],  # search query
            "hl": "en"
        }
        search = GoogleSearch(params)
        results = search.get_dict()
        thumbnail = results.get("author").get("thumbnail")
        name = results.get("author").get("name")
        affiliations = results.get("author").get("affiliations")
        email = results.get("author").get("email")
        website = results.get("author").get("website")
        interests = results.get("author").get("interests")
        cited_by_table = results.get("cited_by", {}).get("table")
        cited_by_graph = results.get("cited_by", {}).get("graph")
        public_access_link = results.get("public_access", {}).get("link")
        available_public_access = results.get("public_access", {}).get("available")
        not_available_public_access = results.get("public_access", {}).get("not_available")
        co_authors = results.get("co_authors")
        author_results_data.append({
          "thumbnail": thumbnail,
          "name": name,
          "affiliations": affiliations,
          "email": email,
          "website": website,
          "interests": interests,
          "cited_by_table": cited_by_table,
          "cited_by_graph": cited_by_graph,
          "public_access_link": public_access_link,
          "available_public_access": available_public_access,
          "not_available_public_access": not_available_public_access,
          "co_authors": co_authors
        })
    return author_results_data

Explanation of scraping author results

Import the profile_results() function and other libraries:

import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd

profile_results() iterates over all available pages and returns a list of dictionaries, each including an author_id such as _xwYD2sAAAAJ.
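For illustration only, pulling just the IDs out of that list could look like this (the author_id field name matches the dictionaries built above):

author_ids = [profile["author_id"] for profile in profile_results()]
print(author_ids)  # e.g. ['_xwYD2sAAAAJ', ...]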

Create a temporary list to store the extracted data:

author_results_data = []

Iterate over the extracted profiles, passing each author_id to the search parameters:

for author_id in profile_results():
    print(f"Parsing {author_id['author_id']} author ID.")
    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_author",    # author results search engine
        "author_id": author_id["author_id"],  # search query: _xwYD2sAAAAJ
        "hl": "en"
    }
    search = GoogleSearch(params)
    results = search.get_dict()

Extract the data:

thumbnail = results.get("author").get("thumbnail")
name = results.get("author").get("name")
affiliations = results.get("author").get("affiliations")
email = results.get("author").get("email")
website = results.get("author").get("website")
interests = results.get("author").get("interests")
cited_by_table = results.get("cited_by", {}).get("table")
cited_by_graph = results.get("cited_by", {}).get("graph")
public_access_link = results.get("public_access", {}).get("link")
available_public_access = results.get("public_access", {}).get("available")
not_available_public_access = results.get("public_access", {}).get("not_available")
co_authors = results.get("co_authors")

Append the extracted data to the temporary list as a dictionary and return it:

author_results_data.append({
    "thumbnail": thumbnail,
    "name": name,
    "affiliations": affiliations,
    "email": email,
    "website": website,
    "interests": interests,
    "cited_by_table": cited_by_table,
    "cited_by_graph": cited_by_graph,
    "public_access_link": public_access_link,
    "available_public_access": available_public_access,
    "not_available_public_access": not_available_public_access,
    "co_authors": co_authors
})
return author_results_data

# example output:
'''
extracting author results..
Extracting profile results..
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
... other authors
Parsing _xwYD2sAAAAJ author ID.
... other authors
[
  {
    "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=view_photo&user=_xwYD2sAAAAJ&citpid=3",
    "name": "Adam Lobel",
    "affiliations": "Blizzard Entertainment",
    "email": "Verified email at AdamLobel.com",
    "website": "https://twitter.com/GrowingUpGaming",
    "interests": [
      {
        "title": "Gaming",
        "link": "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:gaming",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming"
      },
      {
        "title": "Emotion regulation",
        "link": "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:emotion_regulation",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation"
      }
    ],
    "cited_by_table": [
      {
        "citations": {
          "all": 2935,
          "since_2017": 2348
        }
      },
      {
        "h_index": {
          "all": 10,
          "since_2017": 10
        }
      },
      {
        "i10_index": {
          "all": 11,
          "since_2017": 10
        }
      }
    ],
    "cited_by_graph": [
      {
        "year": 2014,
        "citations": 70
      },
      {
        "year": 2015,
        "citations": 188
      },
      {
        "year": 2016,
        "citations": 243
      },
      {
        "year": 2017,
        "citations": 342
      },
      {
        "year": 2018,
        "citations": 420
      },
      {
        "year": 2019,
        "citations": 553
      },
      {
        "year": 2020,
        "citations": 507
      },
      {
        "year": 2021,
        "citations": 504
      },
      {
        "year": 2022,
        "citations": 16
      }
    ],
    "public_access_link": "https://scholar.google.com/citations?view_op=list_mandates&hl=en&user=_xwYD2sAAAAJ",
    "available_public_access": 1,
    "not_available_public_access": 0,
    "co_authors": [
      {
        "name": "Isabela Granic",
        "link": "https://scholar.google.com/citations?user=4T5cjVIAAAAJ&hl=en",
        "serpapi_link": "https://serpapi.com/search.json?author_id=4T5cjVIAAAAJ&engine=google_scholar_author&hl=en",
        "author_id": "4T5cjVIAAAAJ",
        "affiliations": "Radboud University Nijmegen",
        "email": "Verified email at pwo.ru.nl",
        "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=4T5cjVIAAAAJ&citpid=4"
      },
      ... other co-authors
    ]
  }
  ... other authors
]
'''

Scrape all author articles from Google Scholar

import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
def all_author_articles():
    author_article_results_data = []
    for index, author_id in enumerate(profile_results(), start=1):
        print(f"Parsing {index} author with {author_id['author_id']} author ID.")
        params = {
            "api_key": os.getenv("API_KEY"),     # SerpApi API key
            "engine": "google_scholar_author",   # author results search engine
            "hl": "en",                          # language
            "sort": "pubdate",                   # sort by year
            "author_id": author_id["author_id"]  # search query
        }
        search = GoogleSearch(params)
        articles_is_present = True
        while articles_is_present:
            results = search.get_dict()
            for article in results.get("articles", []):
                title = article.get("title")
                link = article.get("link")
                citation_id = article.get("citation_id")
                authors = article.get("authors")
                publication = article.get("publication")
                cited_by_value = article.get("cited_by", {}).get("value")
                cited_by_link = article.get("cited_by", {}).get("link")
                cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
                year = article.get("year")
  
                author_article_results_data.append({
                    "article_title": title,
                    "article_link": link,
                    "article_year": year,
                    "article_citation_id": citation_id,
                    "article_authors": authors,
                    "article_publication": publication,
                    "article_cited_by_value": cited_by_value,
                    "article_cited_by_link": cited_by_link,
                    "article_cited_by_cites_id": cited_by_cites_id,
                })
    
          if "next" in results.get("serpapi_pagination", []):
              search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
          else:
              articles_is_present = False
    return author_article_results_data

Explanation of scraping all author articles

Import the profile_results() function and other libraries:

import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd

In this case, profile_results() is also used to obtain the author_id for parsing author articles.

Create a temporary list to store the extracted data:

author_article_results_data = []

Iterate over profile_results() and pass each author_id to the search parameters:

for index, author_id in enumerate(profile_results(), start=1):
    print(f"Parsing {index} author with {author_id['author_id']} author ID.")
  
    params = {
        "api_key": os.getenv("API_KEY"),     # SerpApi API key
        "engine": "google_scholar_author",   # author results search engine
        "hl": "en",                          # language
        "sort": "pubdate",                   # sort by year
        "author_id": author_id["author_id"]  # search query
    }
    search = GoogleSearch(params)

Set up a while loop and check whether the next page is present:

articles_is_present = True
while articles_is_present:
    results = search.get_dict()
    
    # data extraction code..
    
    # if next page is present -> update previous results to new page results.
    # if next page is not present -> exit the while loop.
    if "next" in results.get("serpapi_pagination", []):
      search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
    else:
      articles_is_present = False

Extract the data in a for loop:

for article in results.get("articles", []):
    title = article.get("title")
    link = article.get("link")
    citation_id = article.get("citation_id")
    authors = article.get("authors")
    publication = article.get("publication")
    cited_by_value = article.get("cited_by", {}).get("value")
    cited_by_link = article.get("cited_by", {}).get("link")
    cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
    year = article.get("year")

Append the extracted data to the temporary list as a dictionary:

author_article_results_data.append({
    "article_title": title,
    "article_link": link,
    "article_year": year,
    "article_citation_id": citation_id,
    "article_authors": authors,
    "article_publication": publication,
    "article_cited_by_value": cited_by_value,
    "article_cited_by_link": cited_by_link,
    "article_cited_by_cites_id": cited_by_cites_id,
})

Return the extracted data:

return author_article_results_data

Save Google Scholar Profile and Author results to CSV

from google_scholar_profile_results import profile_results
import pandas as pd

# author_results() and all_author_articles() are the functions defined above;
# import them from wherever you placed them (see the Full Code section).

def save_profile_results_to_csv():
    print("Waiting for profile results to save..")
    pd.DataFrame(data=profile_results()).to_csv("google_scholar_profile_results.csv", encoding="utf-8", index=False)
    print("Profile Results Saved.")

def save_author_result_to_csv():
    print("Waiting for author results to save..")
    pd.DataFrame(data=author_results()).to_csv("google_scholar_author_results.csv", encoding="utf-8", index=False)
    print("Author Results Saved.")

def save_author_articles_to_csv():
    print("Waiting for author articles to save..")
    pd.DataFrame(data=all_author_articles()).to_csv("google_scholar_author_articles.csv", encoding="utf-8", index=False)
    print("Author Articles Saved.")
  • The data argument inside DataFrame is your data.
  • encoding="utf-8" just makes sure everything is saved correctly. I used it explicitly even though it is the default value.
  • The index=False argument removes pandas' default row numbers.
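As a toy illustration of these to_csv() arguments (hypothetical data, not output from the scraper):

import pandas as pd

sample = [{"name": "Adam Lobel", "cited_by": 2935}]

# index=False drops pandas' default row numbers from the CSV;
# encoding="utf-8" is stated explicitly even though it is the default.
pd.DataFrame(data=sample).to_csv("sample.csv", encoding="utf-8", index=False)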

Full Code

import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd

def profile_results():
    print("Extracting profile results..")
    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_profiles",  # profile results search engine
        "mauthors": "blizzard",               # search query
    }
    search = GoogleSearch(params)
    profile_results_data = []
    profiles_is_present = True
    while profiles_is_present:
        profile_results = search.get_dict()
        for profile in profile_results.get("profiles", []):
            print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')
            thumbnail = profile.get("thumbnail")
            name = profile.get("name")
            link = profile.get("link")
            author_id = profile.get("author_id")
            affiliations = profile.get("affiliations")
            email = profile.get("email")
            cited_by = profile.get("cited_by")
            interests = profile.get("interests")
            profile_results_data.append({
                "thumbnail": thumbnail,
                "name": name,
                "link": link,
                "author_id": author_id,
                "email": email,
                "affiliations": affiliations,
                "cited_by": cited_by,
                "interests": interests
            })
        if "next" in profile_results.get("pagination", []):
            search.params_dict.update(dict(parse_qsl(urlsplit(profile_results.get("pagination").get("next")).query)))
        else:
            profiles_is_present = False
    return profile_results_data

def author_results():
    print("extracting author results..")
    author_results_data = []
    for author_id in profile_results():
        print(f"Parsing {author_id['author_id']} author ID.")
        params = {
            "api_key": os.getenv("API_KEY"),      # SerpApi API key
            "engine": "google_scholar_author",    # author results search engine
            "author_id": author_id["author_id"],  # search query
            "hl": "en"
        }
        search = GoogleSearch(params)
        results = search.get_dict()
        thumbnail = results.get("author").get("thumbnail")
        name = results.get("author").get("name")
        affiliations = results.get("author").get("affiliations")
        email = results.get("author").get("email")
        website = results.get("author").get("website")
        interests = results.get("author").get("interests")
        cited_by_table = results.get("cited_by", {}).get("table")
        cited_by_graph = results.get("cited_by", {}).get("graph")
        public_access_link = results.get("public_access", {}).get("link")
        available_public_access = results.get("public_access", {}).get("available")
        not_available_public_access = results.get("public_access", {}).get("not_available")
        co_authors = results.get("co_authors")
        author_results_data.append({
            "thumbnail": thumbnail,
            "name": name,
            "affiliations": affiliations,
            "email": email,
            "website": website,
            "interests": interests,
            "cited_by_table": cited_by_table,
            "cited_by_graph": cited_by_graph,
            "public_access_link": public_access_link,
            "available_public_access": available_public_access,
            "not_available_public_access": not_available_public_access,
            "co_authors": co_authors
        })
    return author_results_data

def all_author_articles():
    author_article_results_data = []
    for index, author_id in enumerate(profile_results(), start=1):
        print(f"Parsing author #{index} with {author_id['author_id']} author ID.")
        params = {
            "api_key": os.getenv("API_KEY"),     # SerpApi API key
            "engine": "google_scholar_author",   # author results search engine
            "hl": "en",                          # language
            "sort": "pubdate",                   # sort by year
            "author_id": author_id["author_id"]  # search query
        }
        search = GoogleSearch(params)
        articles_is_present = True
        while articles_is_present:
            results = search.get_dict()
            for article in results.get("articles", []):
                title = article.get("title")
                link = article.get("link")
                citation_id = article.get("citation_id")
                authors = article.get("authors")
                publication = article.get("publication")
                cited_by_value = article.get("cited_by", {}).get("value")
                cited_by_link = article.get("cited_by", {}).get("link")
                cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
                year = article.get("year")
                author_article_results_data.append({
                    "article_title": title,
                    "article_link": link,
                    "article_year": year,
                    "article_citation_id": citation_id,
                    "article_authors": authors,
                    "article_publication": publication,
                    "article_cited_by_value": cited_by_value,
                    "article_cited_by_link": cited_by_link,
                    "article_cited_by_cites_id": cited_by_cites_id,
                })
            if "next" in results.get("serpapi_pagination", []):
                search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
            else:
                articles_is_present = False
    return author_article_results_data

def save_author_result_to_csv():
    print("Waiting for author results to save..")
    pd.DataFrame(data=author_results()).to_csv("google_scholar_author_results.csv", encoding="utf-8", index=False)
    print("Author Results Saved.")

def save_author_articles_to_csv():
    print("Waiting for author articles to save..")
    pd.DataFrame(data=all_author_articles()).to_csv("google_scholar_author_articles.csv", encoding="utf-8", index=False)
    print("Author Articles Saved.")

def save_profile_results_to_csv():
    print("Waiting for profile results to save..")
    pd.DataFrame(data=profile_results()).to_csv("google_scholar_profile_results.csv", encoding="utf-8", index=False)
    print("Profile Results Saved.")

Outro

If your goal is to extract data without having to write the parser from scratch, figure out how to bypass blocks from search engines, how to scale it, or how to extract data from JavaScript — have a look at SerpApi.

If you have anything to share, any questions, suggestions, or something that isn't working correctly, feel free to reach out via Twitter at @dimitryzub or @serp_api.

Join us on Reddit | Twitter | YouTube

Add a Feature Request💫 or a Bug🐞
