
Test crawler, complete documentation.

Add, Fix:
    bitshift/crawler/
        __init__.py
            -add module and crawl() docstrings.
            -add repository_queue size limit.

        crawler.py
            -account for time spent executing an API query in the run() loop
             sleep() interval (see the timing sketch below).
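
A minimal sketch of the timing pattern behind the crawler.py fix above, for reference only; do_api_query() is a hypothetical placeholder for the requests.get() call made in the real run() loop shown in the diff below:

import time

api_request_interval = 5e3 / 60 ** 2   # ~1.39 s per request, i.e. 5,000 requests per hour

while True:
    start_time = time.time()
    do_api_query()   # hypothetical stand-in for the GitHub API request

    # Sleep only for what is left of the interval, so the time spent
    # executing the query itself is not added on top of it.
    sleep_time = api_request_interval - (time.time() - start_time)
    if sleep_time > 0:
        time.sleep(sleep_time)
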
tags/v1.0^2
Severyn Kozak, 10 years ago
Commit b680756f8d
3 changed files with 91 additions and 33 deletions
  1. +17  -1   bitshift/crawler/__init__.py
  2. +74  -32  bitshift/crawler/crawler.py
  3. +0   -0   bitshift/crawler/indexer.py

+17 -1  bitshift/crawler/__init__.py

@@ -1,3 +1,9 @@
+"""
+:synopsis: Parent crawler module, which supervises all crawlers.
+
+Contains functions for initializing all subsidiary, threaded crawlers.
+"""
+
 import Queue
 
 from bitshift.crawler import crawler
@@ -5,8 +11,18 @@ from bitshift.crawler import git_indexer
 
 __all__ = ["crawl"]
 
+MAX_URL_QUEUE_SIZE = 5e3
+
 def crawl():
-    repository_queue = Queue.Queue()
+    """
+    Initialize all crawlers (and indexers).
+
+    Start the:
+        1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`
+        2. Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer`
+    """
+
+    repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
     indexer = git_indexer.GitIndexer(repository_queue)
 


+74 -32  bitshift/crawler/crawler.py

@@ -12,46 +12,88 @@ from ..codelet import Codelet
 from ..database import Database
 
 class GitHubCrawler(threading.Thread):
+    """
+    Crawler that retrieves links to all of GitHub's public repositories.
+
+    GitHubCrawler is a threaded singleton that queries GitHub's API for URLs
+    to its public repositories, which it inserts into a :class:`Queue.Queue`
+    shared with :class:`bitshift.crawler.git_indexer.GitIndexer`.
+
+    :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with
+        repository information retrieved by `GitHubCrawler`, and other Git
+        crawlers, to be processed by
+        :class:`bitshift.crawler.git_indexer.GitIndexer`.
+    """
+
     def __init__(self, repository_queue):
+        """
+        Create an instance of the singleton `GitHubCrawler`.
+
+        :param repository_queue: A queue containing dictionaries of repository
+            metadata retrieved by `GitHubCrawler`, meant to be processed by an
+            instance of :class:`bitshift.crawler.git_indexer.GitIndexer`.
+
+            .. code-block:: python
+                sample_dict = {
+                    "url" : "https://github.com/user/repo",
+                    "name" : "repo",
+                    "framework_name" : "GitHub"
+                }
+
+        :type repository_queue: :class:`Queue.Queue`
+        """
+
         self.repository_queue = repository_queue
         super(GitHubCrawler, self).__init__()
 
-    def run():
-        _github()
+    def run(self):
+        """
+        Query the GitHub API for data about every public repository.
 
-def _github():
-    """
-    Query the GitHub API for data about every public repository.
+        Pull all of GitHub's repositories by making calls to its API in a loop,
+        accessing a subsequent page of results via the "next" URL returned in an
+        API response header. Uses Severyn Kozak's (sevko) authentication
+        credentials.
+        """
 
-    Pull all of GitHub's repositories by making calls to its API in a loop,
-    accessing a subsequent page of results via the "next" URL returned in an
-    API response header. Uses Severyn Kozak's (sevko) authentication
-    credentials.
-    """
+        next_api_url = "https://api.github.com/repositories"
+        authentication_params = {
+            "client_id" : "436cb884ae09be7f2a4e",
+            "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
+        }
+        api_request_interval = 5e3 / 60 ** 2
 
-    next_api_url = "https://api.github.com/repositories"
-    authentication_params = {
-        "client_id" : "436cb884ae09be7f2a4e",
-        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
-    }
-    api_request_interval = 5e3 / 60 ** 2
+        while len(next_api_url) > 0:
+            # DEBUG
+            db.log.insert({
+                "time" : str(time.time()).split(".")[0][-4:],
+                "qsize" : self.repository_queue.qsize()
+            })
 
-    while len(next_api_url) > 0:
-        start_time = time.time()
-        response = requests.get(next_api_url, params=authentication_params)
+            start_time = time.time()
+            response = requests.get(next_api_url, params=authentication_params)
 
-        for repo in response.json():
-            self.repository_queue.put({
-                "url" : repo["html_url"],
-                "framework_name" : "GitHub"
-            })
-            self.repository_queue.task_done()
+            for repo in response.json():
+                logging.basicConfig(filename="crawler.log", level=logging.DEBUG)
+                logging.debug("crawler: %-20s: %-5s: %-5s: %s",
+                        str(time.time()).split(".")[0],
+                        self.repository_queue.qsize(), repo["id"],
+                        repo["name"])
+                while self.repository_queue.full():
+                    pass
+                self.repository_queue.put({
+                    "url" : repo["html_url"],
+                    "name" : repo["html_url"].split("/")[-1],
+                    "framework_name" : "GitHub"
+                })
 
-        if int(response.headers["x-ratelimit-remaining"]) == 0:
-            time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time())
+            if int(response.headers["x-ratelimit-remaining"]) == 0:
+                time.sleep(int(response.headers["x-ratelimit-reset"]) -
+                        time.time())
 
-        next_api_url = response.headers["link"].split(">")[0][1:]
+            next_api_url = response.headers["link"].split(">")[0][1:]
 
-        sleep_time = api_request_interval - (time.time() - start_time)
-        if sleep_time > 0:
-            time.sleep(sleep_time)
+            sleep_time = api_request_interval - (time.time() - start_time)
+            if sleep_time > 0:
+                time.sleep(sleep_time)
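
Two side notes on the new run() loop, neither of which is what this commit does; both are hedged alternatives assuming the standard behaviour of requests and Queue:

# Alternative pagination: requests parses the Link header into response.links;
# the "next" relation is simply absent on the last page of results.
next_api_url = response.links.get("next", {}).get("url", "")

# Alternative back-pressure: Queue.put() blocks until a slot frees up when the
# queue was created with a maxsize, so the busy "while ... full(): pass" loop
# could be dropped in favour of a plain blocking put().
self.repository_queue.put({
    "url" : repo["html_url"],
    "name" : repo["html_url"].split("/")[-1],
    "framework_name" : "GitHub"
})
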

bitshift/crawler/git_indexer.py → bitshift/crawler/indexer.py

