
Re-add logging, remove file filters.

Add:
    bitshift/
        __init__.py
            -add `_configure_logging()`, which sets up a more robust logging
            infrastructure than was previously used: log files are rotated once
            per hour and have some additional formatting rules (see the sketch
            after this message).

        (crawler, indexer).py
            -add hierarchically-descending loggers to individual threaded
            classes (`GitHubCrawler`, `GitIndexer`, etc.); add logging calls.

        indexer.py
            -remove file filtering regex matches from `_get_tracked_files()`,
            as non-code files will be discarded by the parsers.
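
For reference, the hourly rotation and hierarchically-descending loggers
described above need nothing beyond the standard library's `logging` module.
A minimal sketch (the log path and logger name here are illustrative, not
the exact values used in the diff below):

    import logging, logging.handlers

    # Root handler: rotate the log file once per hour, keeping 20 old files.
    handler = logging.handlers.TimedRotatingFileHandler(
            "app.log", when="H", interval=1, backupCount=20)
    handler.setFormatter(logging.Formatter(
            fmt="%(asctime)s %(levelname)s %(name)s %(funcName)s %(message)s",
            datefmt="%y-%m-%d %H:%M:%S"))
    logging.getLogger().addHandler(handler)
    logging.getLogger().setLevel(logging.NOTSET)

    # A class-specific child logger: its records propagate up to the root
    # handler, and the %(name)s field identifies the emitting class.
    logger = logging.getLogger("bitshift.crawler.crawler.GitHubCrawler")
    logger.info("Starting.")
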
tags/v1.0^2
Severyn Kozak, 10 years ago
commit 6762c1fa3d

3 files changed, 114 insertions(+), 73 deletions(-):
  1. bitshift/crawler/__init__.py  (+24, -1)
  2. bitshift/crawler/crawler.py   (+35, -11)
  3. bitshift/crawler/indexer.py   (+55, -61)

bitshift/crawler/__init__.py (+24, -1)

@@ -4,7 +4,7 @@
 Contains functions for initializing all subsidiary, threaded crawlers.
 """

-import os, Queue
+import logging, logging.handlers, os, Queue

 from bitshift.crawler import crawler, indexer

@@ -20,6 +20,8 @@ def crawl():
     3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
     """

+    _configure_logging()
+
     MAX_URL_QUEUE_SIZE = 5e3

     repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
@@ -29,3 +31,24 @@ def crawl():

     for thread in threads:
         thread.start()
+
+def _configure_logging():
+    LOG_FILE_DIR = "log"
+
+    if not os.path.exists(LOG_FILE_DIR):
+        os.mkdir(LOG_FILE_DIR)
+
+    logging.getLogger("requests").setLevel(logging.WARNING)
+
+    formatter = logging.Formatter(
+            fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
+            " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")
+
+    handler = logging.handlers.TimedRotatingFileHandler(
+            "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
+            backupCount=20)
+    handler.setFormatter(formatter)
+
+    root_logger = logging.getLogger()
+    root_logger.addHandler(handler)
+    root_logger.setLevel(logging.NOTSET)
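
Design note: with the root logger left at NOTSET and a single
TimedRotatingFileHandler attached to it, every hierarchically-named child
logger in the crawler and indexer funnels into one hourly-rotated
log/app.log, and backupCount=20 caps retention at roughly the last twenty
hours. Raising the third-party `requests` logger to WARNING keeps its
per-request chatter out of the file.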

bitshift/crawler/crawler.py (+35, -11)

@@ -4,7 +4,7 @@
 Contains all website/framework-specific Class crawlers.
 """

-import requests, time, threading
+import logging, requests, time, threading

 from bitshift.crawler import indexer

@@ -22,6 +22,7 @@ class GitHubCrawler(threading.Thread):
     :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
         with repository metadata retrieved by :class:`GitHubCrawler`, and other Git
         crawlers, to be processed by :class:`indexer.GitIndexer`.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
     """

     AUTHENTICATION = {
@@ -39,6 +40,9 @@ class GitHubCrawler(threading.Thread):
         """

         self.clone_queue = clone_queue
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")
         super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

     def run(self):
@@ -61,11 +65,17 @@ class GitHubCrawler(threading.Thread):
             try:
                 response = requests.get(next_api_url,
                         params=self.AUTHENTICATION)
-            except ConnectionError as exception:
+            except ConnectionError as excep:
+                self._logger.warning("API %s call failed: %s: %s",
+                        next_api_url, excep.__class__.__name__, excep)
                 time.sleep(0.5)
                 continue

+            queue_percent_full = (float(self.clone_queue.qsize()) /
+                    self.clone_queue.maxsize) * 100
+            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
+                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
+                    queue_percent_full)))

             for repo in response.json():
                 while self.clone_queue.full():
@@ -73,15 +83,15 @@ class GitHubCrawler(threading.Thread):

                 self.clone_queue.put(indexer.GitRepository(
                         repo["html_url"], repo["full_name"].replace("/", ""),
-                        "GitHub"))
+                        "GitHub",
+                        #self._get_repo_stars(repo["full_name"]))
+                        0))

             if int(response.headers["x-ratelimit-remaining"]) == 0:
                 time.sleep(int(response.headers["x-ratelimit-reset"]) -
                         time.time())

             next_api_url = response.headers["link"].split(">")[0][1:]
-            with open(".github_api.log", "w") as log_file:
-                log_file.write("%s\n" % next_api_url)

             sleep_time = api_request_interval - (time.time() - start_time)
             if sleep_time > 0:
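
Note that the ad-hoc `.github_api.log` dump of the next API url (and its
`.bitbucket_api.log` counterpart below) is removed here; the structured
logging configured in `__init__.py` supersedes these one-off files.
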
@@ -105,7 +115,6 @@ class GitHubCrawler(threading.Thread):

         API_URL = "https://api.github.com/search/repositories"

-
         params = self.AUTHENTICATION
         params["q"] = "repo:%s" % repo_name

@@ -116,9 +125,18 @@ class GitHubCrawler(threading.Thread):
                 })

         if int(resp.headers["x-ratelimit-remaining"]) == 0:
-            time.sleep(int(resp.headers["x-ratelimit-reset"]) - time.time())
+            sleep_time = int(resp.headers["x-ratelimit-reset"]) - time.time()
+            if sleep_time > 0:
+                logging.info("API quota exceeded. Sleep time: %d." % sleep_time)
+                time.sleep(sleep_time)

-        return int(resp.json()["items"][0]["stargazers_count"])
+        if "items" not in resp.json() or len(resp.json()["items"]) == 0:
+            self._logger.critical("No API result: %s. Result: %s" % (resp.url,
+                    str(resp.json())))
+            return 0
+        else:
+            rank = float(resp.json()["items"][0]["stargazers_count"]) / 1000
+            return rank if rank < 1.0 else 1.0

 class BitbucketCrawler(threading.Thread):
     """
@@ -131,6 +149,7 @@ class BitbucketCrawler(threading.Thread):

     :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
         :class:`indexer.GitRepository` repository urls into.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
     """

     def __init__(self, clone_queue):
@@ -143,6 +162,9 @@ class BitbucketCrawler(threading.Thread):
         """

         self.clone_queue = clone_queue
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")
         super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

     def run(self):
@@ -162,10 +184,15 @@ class BitbucketCrawler(threading.Thread):
                 response = requests.get(next_api_url).json()
             except ConnectionError as exception:
                 time.sleep(0.5)
+                self._logger.warning("API %s call failed: %s: %s",
+                        next_api_url, exception.__class__.__name__, exception)
                 continue

+            queue_percent_full = (float(self.clone_queue.qsize()) /
+                    self.clone_queue.maxsize) * 100
+            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
+                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
+                    queue_percent_full)))

             for repo in response["values"]:
                 if repo["scm"] == "git":
@@ -181,7 +208,4 @@ class BitbucketCrawler(threading.Thread):
                         clone_url, repo["full_name"], "Bitbucket"))

             next_api_url = response["next"]
-            with open(".bitbucket_api.log", "w") as log_file:
-                log_file.write("%s\n" % next_api_url)

             time.sleep(0.2)

bitshift/crawler/indexer.py (+55, -61)

@@ -3,7 +3,8 @@
 repositories.
 """

-import bs4, os, Queue, re, shutil, string, subprocess, time, threading
+import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\
+        threading

 from ..database import Database
 from ..codelet import Codelet
@@ -11,6 +12,9 @@ from ..codelet import Codelet
 GIT_CLONE_DIR = "/tmp/bitshift"
 THREAD_QUEUE_SLEEP = 0.5

+import pymongo #debug
+db = pymongo.MongoClient().bitshift #debug
+
 class GitRepository(object):
     """
     A representation of a Git repository's metadata.
@@ -19,24 +23,29 @@ class GitRepository(object):
     :ivar name: (str) The name of the repository.
     :ivar framework_name: (str) The name of the online Git framework that the
         repository belongs to (eg, GitHub, BitBucket).
+    :ivar rank: (float) The rank of the repository, as assigned by
+        :class:`crawler.GitHubCrawler`.
     """

-    def __init__(self, url, name, framework_name):
+    def __init__(self, url, name, framework_name, rank):
         """
         Create a GitRepository instance.

         :param url: see :attr:`GitRepository.url`
         :param name: see :attr:`GitRepository.name`
         :param framework_name: see :attr:`GitRepository.framework_name`
+        :param rank: see :attr:`GitRepository.rank`

         :type url: str
         :type name: str
         :type framework_name: str
+        :type rank: float
         """

         self.url = url
         self.name = name
         self.framework_name = framework_name
+        self.rank = rank

 class GitIndexer(threading.Thread):
     """
@@ -50,6 +59,7 @@ class GitIndexer(threading.Thread):
         cloned by :class:`_GitCloner`, which are to be indexed.
     :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner,
         which feeds :class:`GitIndexer`.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
     """

     def __init__(self, clone_queue):
@@ -66,6 +76,9 @@ class GitIndexer(threading.Thread):
         self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
         self.git_cloner = _GitCloner(clone_queue, self.index_queue)
         self.git_cloner.start()
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")

         if not os.path.exists(GIT_CLONE_DIR):
             os.makedirs(GIT_CLONE_DIR)
@@ -88,52 +101,43 @@ class GitIndexer(threading.Thread):

             repo = self.index_queue.get()
             self.index_queue.task_done()
-            try:
-                self._index_repository(repo.url, repo.name, repo.framework_name)
-            except Exception as exception:
-                pass
+            # try:
+            self._index_repository(repo)
+            # except Exception as excep:
+            #     self._logger.warning("%s: %s.", excep.__class__.__name__, excep)

-    def _index_repository(self, repo_url, repo_name, framework_name):
+    def _index_repository(self, repo):
         """
         Clone and index (create and insert Codeletes for) a Git repository.

-        `git clone` the Git repository located at **repo_url**, call
-        _insert_repository_codelets, then remove said repository.
+        `git clone` the Git repository located at **repo.url**, call
+        `_insert_repository_codelets()`, then remove said repository.

-        :param repo_url: The url the Git repository was cloned from.
-        :param repo_name: The name of the repository.
-        :param framework_name: The name of the framework the repository is from.
+        :param repo: The metadata of the repository to be indexed.

-        :type repo_url: str
-        :type repo_name: str
-        :type framework_name: str
+        :type repo: :class:`GitRepository`
         """

-        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir:
-            try:
-                self._insert_repository_codelets(repo_url, repo_name,
-                        framework_name)
-            except Exception as exception:
-                pass
+        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)) as repository_dir:
+            # try:
+            self._insert_repository_codelets(repo)
+            # except Exception as excep:
+            #     self._logger.warning("%s: %s.", excep.__class__.__name__, excep)

-        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)):
-            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
+        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
+            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))

-    def _insert_repository_codelets(self, repo_url, repo_name, framework_name):
+    def _insert_repository_codelets(self, repo):
         """
         Create and insert a Codelet for the files inside a Git repository.

-        Create a new Codelet, and insert it into the Database singleton, for every
-        file inside the current working directory's default branch (usually
-        *master*).
+        Create a new Codelet, and insert it into the Database singleton, for
+        every file inside the current working directory's default branch
+        (usually *master*).

-        :param repo_url: The url the Git repository was cloned from.
-        :param repo_name: The name of the repository.
-        :param framework_name: The name of the framework the repository is from.
+        :param repo: The metadata of the repository to be indexed.

-        :type repo_url: str
-        :type repo_name: str
-        :type framework_name: str
+        :type repo: :class:`GitRepository`
         """

         commits_meta = _get_commits_metadata()
@@ -142,7 +146,6 @@ class GitIndexer(threading.Thread):

         for filename in commits_meta.keys():
             try:
-                source = ""
                 with open(filename) as source_file:
                     source = _decode(source_file.read())
                     if source is None:
@@ -152,13 +155,14 @@ class GitIndexer(threading.Thread):

             authors = [(_decode(author),) for author in \
                     commits_meta[filename]["authors"]]
-            codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
-                    None, authors, _generate_file_url(filename, repo_url,
-                        framework_name),
+            codelet = Codelet("%s:%s" % (repo.name, filename), source, filename,
+                    None, authors, _generate_file_url(filename,
+                            repo.url, repo.framework_name),
                     commits_meta[filename]["time_created"],
-                    commits_meta[filename]["time_last_modified"])
+                    commits_meta[filename]["time_last_modified"],
+                    repo.rank)

-            Database.insert(codelet)
+            # Database.insert(codelet)
+            db.codelets.insert(codelet.__dict__) #debug

 class _GitCloner(threading.Thread):
     """
@@ -171,6 +175,7 @@ class _GitCloner(threading.Thread):
         :attr:`crawler.GitHubCrawler.clone_queue`.
     :ivar index_queue: (:class:`Queue.Queue`) see
         :attr:`GitIndexer.index_queue`.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
     """

     def __init__(self, clone_queue, index_queue):
@@ -186,6 +191,9 @@ class _GitCloner(threading.Thread):

         self.clone_queue = clone_queue
         self.index_queue = index_queue
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")
         super(_GitCloner, self).__init__(name=self.__class__.__name__)

     def run(self):
@@ -339,11 +347,11 @@ def _get_git_commits():
         sample_returned_array = [
             {
                 "author" : (str) "author"
-                "timestamp" : (int) 1396919293,
+                "timestamp" : (`datetime.datetime`) <object>,
                 "filenames" : (str array) ["file1", "file2"]
             }
         ]
-    :rtype: dictionary
+    :rtype: array of dictionaries
     """

     git_log = subprocess.check_output(("git --no-pager log --name-only"
@@ -355,7 +363,7 @@ def _get_git_commits():
         if len(fields) > 2:
             commits.append({
                 "author" : fields[0],
-                "timestamp" : int(fields[1]),
+                "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])),
                 "filenames" : fields[2].split("\x00")[:-2]
             })
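
Converting at parse time hands real `datetime.datetime` objects to
everything downstream instead of raw Unix timestamps. For example (shown in
UTC; `fromtimestamp()` itself uses the machine's local timezone):

    >>> import datetime
    >>> datetime.datetime.utcfromtimestamp(1396919293)
    datetime.datetime(2014, 4, 8, 1, 8, 13)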

@@ -374,28 +382,14 @@ def _get_tracked_files():
     :rtype: str array
     """

-    GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]
-    GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?",
-            "md(wn|t[e]?xt)?", "rst"]
-
     files = []
     for dirname, subdir_names, filenames in os.walk("."):
         for filename in filenames:
             path = os.path.join(dirname, filename)
             if _is_ascii(path):
-                files.append(path)
-
-    valuable_files = []
-    for filename in files:
-        filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
-            for pattern in GIT_IGNORE_FILES])
-        extension = filename.split(".")[-1]
-        extension_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
-            for pattern in GIT_IGNORE_EXTENSIONS])
-
-        if not (filename_match or extension_match):
-            valuable_files.append(filename[2:])
-    return valuable_files
+                files.append(path[2:])
+
+    return files

def _get_commits_metadata():
"""
@@ -407,11 +401,11 @@ def _get_commits_metadata():
         sample_returned_dict = {
             "my_file" : {
                 "authors" : (str array) ["author1", "author2"],
-                "time_created" : (int) 1395939566,
-                "time_last_modified" : (int) 1396920409
+                "time_created" : (`datetime.datetime`) <object>,
+                "time_last_modified" : (`datetime.datetime`) <object>
             }
         }
-    :rtype: dictionary
+    :rtype: dictionary of dictionaries
     """

     commits = _get_git_commits()

