-
Notifications
You must be signed in to change notification settings - Fork 3.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix libraries performance problem #34523
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,9 +4,9 @@ | |
""" | ||
|
||
|
||
from collections import defaultdict | ||
import logging | ||
from abc import ABCMeta, abstractmethod | ||
from collections import defaultdict | ||
from contextlib import contextmanager | ||
|
||
from django.contrib.auth.models import User # lint-amnesty, pylint: disable=imported-auth-user | ||
|
@@ -23,6 +23,9 @@ | |
# A mapping of roles to the roles that they inherit permissions from. | ||
ACCESS_ROLES_INHERITANCE = {} | ||
|
||
# The key used to store roles for a user in the cache that do not belong to a course or do not have a course id. | ||
ROLE_CACHE_UNGROUPED_ROLES__KEY = 'ungrouped' | ||
|
||
|
||
def register_access_role(cls): | ||
""" | ||
|
@@ -60,21 +63,52 @@ def strict_role_checking(): | |
ACCESS_ROLES_INHERITANCE.update(OLD_ACCESS_ROLES_INHERITANCE) | ||
|
||
|
||
def get_role_cache_key_for_course(course_key=None): | ||
""" | ||
Get the cache key for the course key. | ||
""" | ||
return str(course_key) if course_key else ROLE_CACHE_UNGROUPED_ROLES__KEY | ||
|
||
|
||
class BulkRoleCache: # lint-amnesty, pylint: disable=missing-class-docstring | ||
""" | ||
This class provides a caching mechanism for roles grouped by users and courses, | ||
using a nested dictionary structure to optimize lookup performance. The cache structure is designed as follows: | ||
|
||
{ | ||
user_id_1: { | ||
course_id_1: {role1, role2, role3}, # Set of roles associated with course_id_1 | ||
course_id_2: {role4, role5, role6}, # Set of roles associated with course_id_2 | ||
[ROLE_CACHE_UNGROUPED_ROLES_KEY]: {role7, role8} # Set of roles not tied to any specific course or library | ||
}, | ||
user_id_2: { ... } # Similar structure for another user | ||
} | ||
|
||
- Each top-level dictionary entry keys by `user_id` to access role data for a specific user. | ||
- Nested within each user's dictionary, entries are keyed by `course_id` grouping roles by course. | ||
- The special key `ROLE_CACHE_UNGROUPED_ROLES_KEY` (a constant defined above) | ||
stores roles that are not associated with any specific course or library. | ||
""" | ||
|
||
CACHE_NAMESPACE = "student.roles.BulkRoleCache" | ||
CACHE_KEY = 'roles_by_user' | ||
|
||
@classmethod | ||
def prefetch(cls, users): # lint-amnesty, pylint: disable=missing-function-docstring | ||
roles_by_user = defaultdict(set) | ||
roles_by_user = defaultdict(lambda: defaultdict(set)) | ||
get_cache(cls.CACHE_NAMESPACE)[cls.CACHE_KEY] = roles_by_user | ||
|
||
for role in CourseAccessRole.objects.filter(user__in=users).select_related('user'): | ||
ormsbee marked this conversation as resolved.
Show resolved
Hide resolved
|
||
roles_by_user[role.user.id].add(role) | ||
user_id = role.user.id | ||
course_id = get_role_cache_key_for_course(role.course_id) | ||
|
||
# Add role to the set in roles_by_user[user_id][course_id] | ||
user_roles_set_for_course = roles_by_user[user_id][course_id] | ||
user_roles_set_for_course.add(role) | ||
|
||
users_without_roles = [u for u in users if u.id not in roles_by_user] | ||
for user in users_without_roles: | ||
roles_by_user[user.id] = set() | ||
roles_by_user[user.id] = {} | ||
|
||
@classmethod | ||
def get_user_roles(cls, user): | ||
|
@@ -83,15 +117,32 @@ def get_user_roles(cls, user): | |
|
||
class RoleCache: | ||
""" | ||
A cache of the CourseAccessRoles held by a particular user | ||
A cache of the CourseAccessRoles held by a particular user. | ||
Internal data structures should be accessed by getter and setter methods; | ||
don't use `_roles_by_course_id` or `_roles` directly. | ||
_roles_by_course_id: This is the data structure as saved in the RequestCache. | ||
It contains all roles for a user as a dict that's keyed by course_id. | ||
The key ROLE_CACHE_UNGROUPED_ROLES__KEY is used for all roles | ||
that are not associated with a course. | ||
_roles: This is a set of all roles for a user, ungrouped. It's used for some types of | ||
lookups and collected from _roles_by_course_id on initialization | ||
so that it doesn't need to be recalculated. | ||
|
||
""" | ||
def __init__(self, user): | ||
try: | ||
self._roles = BulkRoleCache.get_user_roles(user) | ||
self._roles_by_course_id = BulkRoleCache.get_user_roles(user) | ||
except KeyError: | ||
self._roles = set( | ||
CourseAccessRole.objects.filter(user=user).all() | ||
) | ||
self._roles_by_course_id = {} | ||
roles = CourseAccessRole.objects.filter(user=user).all() | ||
for role in roles: | ||
course_id = get_role_cache_key_for_course(role.course_id) | ||
if not self._roles_by_course_id.get(course_id): | ||
self._roles_by_course_id[course_id] = set() | ||
self._roles_by_course_id[course_id].add(role) | ||
self._roles = set() | ||
for roles_for_course in self._roles_by_course_id.values(): | ||
self._roles.update(roles_for_course) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Question: Shouldn't this kind of refresh logic be delegated to something in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe it should. The BulkRoleCache is responsible for roles in general it seems and this is scoped to a specific user's roles, so you could maybe make some case for it to stay here. The refresh happened here originally as well, so I tried to change as few things as I could to avoid breaking stuff. With your guidance here I could see about moving this logic to the BulkRoleCache in some way, potentially, but maybe that's rather part of a separate need to work on the RoleCache in general: Generally I wasn't very impressed with the design of the RolesCache and BulkRoleCache, given that
I could probably find some more things that bother me. It probably would make sense to improve the architecture of this RoleCache in a separate issue, though. I'd love to see that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'd have to dig into the commit history, but keep in mind that things like this sometimes spring up as a series of bandaids that deal with specific bugs/issues over time, and don't necessarily add up to a coherent design intent. (Though sometimes they do, and those can be mind-bending. 😛 ) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think this is part of the early days of fumbling around as to how to cache something at the request level. There's no real built-in thing for this in Django, but there is a User object that's passed into the view as part of the Request, and that Request can be munged with in middleware. So if you want to stick some data onto a user for the duration of the request, you can slap it in there. Not something I'd recommend as a good practice, but I'd bet that's where it started from. |
||
|
||
@staticmethod | ||
def get_roles(role): | ||
|
@@ -100,16 +151,24 @@ def get_roles(role): | |
""" | ||
return ACCESS_ROLES_INHERITANCE.get(role, set()) | {role} | ||
|
||
@property | ||
def all_roles_set(self): | ||
return self._roles | ||
|
||
@property | ||
def roles_by_course_id(self): | ||
return self._roles_by_course_id | ||
|
||
def has_role(self, role, course_id, org): | ||
""" | ||
Return whether this RoleCache contains a role with the specified role | ||
or a role that inherits from the specified role, course_id and org. | ||
""" | ||
course_id_string = get_role_cache_key_for_course(course_id) | ||
course_roles = self._roles_by_course_id.get(course_id_string, []) | ||
return any( | ||
access_role.role in self.get_roles(role) and | ||
access_role.course_id == course_id and | ||
access_role.org == org | ||
for access_role in self._roles | ||
access_role.role in self.get_roles(role) and access_role.org == org | ||
for access_role in course_roles | ||
) | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This also raises my eyebrows performance-wise, although I might just be mis-understanding it.
roles_by_user
is a dict which is stored in the cache, and the next few lines populate the dict. Does every update toroles_by_user
trigger a cache-write? Or does the code somehow know that it needs to bulk-update theroles_by_user
cache entry once this method finishesThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this works because it's a Request cache, so just an in-memory dict with no write-through to an external cache server.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it, ty
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you referring to the existing code or to my code change here?
I have to say I was quite confused by the
prefetch
method as well - the lineget_cache(cls.CACHE_NAMESPACE)[cls.CACHE_KEY] = roles_by_user
is a pretty weird looking way to write to a cache, especially since the RequestCache has explicit set operations.I was wondering if this even works and if so I highly doubt that this is good practice.
However my change was only to change the datastructure in the cache for each user from a set to a dict, so I didn't want to touch this to avoid breaking anything. But I was going to ask you.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I asked in our python slack about it and they confirmed that the RequestCache is pretty much just a writable dict. It's still weird to me that you'd do this instead of using it's explicit setter method.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is very likely that this usage of RequestCache just predates RequestCache having a setter method.