Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix libraries performance problem #34523

Merged
merged 1 commit into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cms/djangoapps/contentstore/rest_api/v1/views/home.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,4 +215,5 @@ def get(self, request: Request):

library_context = get_library_context(request)
serializer = LibraryTabSerializer(library_context)

return Response(serializer.data)
2 changes: 1 addition & 1 deletion cms/djangoapps/contentstore/views/course.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
CourseStaffRole,
GlobalStaff,
UserBasedRole,
OrgStaffRole
OrgStaffRole,
)
from common.djangoapps.util.json_request import JsonResponse, JsonResponseBadRequest, expect_json
from common.djangoapps.util.string_utils import _has_non_ascii_characters
Expand Down
3 changes: 2 additions & 1 deletion common/djangoapps/student/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
OrgContentCreatorRole,
OrgInstructorRole,
OrgLibraryUserRole,
OrgStaffRole
OrgStaffRole,
)

# Studio permissions:
Expand Down Expand Up @@ -106,6 +106,7 @@ def get_user_permissions(user, course_key, org=None):
# Staff have all permissions except EDIT_ROLES:
if OrgStaffRole(org=org).has_user(user) or (course_key and user_has_role(user, CourseStaffRole(course_key))):
return STUDIO_VIEW_USERS | STUDIO_EDIT_CONTENT | STUDIO_VIEW_CONTENT

# Otherwise, for libraries, users can view only:
if course_key and isinstance(course_key, LibraryLocator):
if OrgLibraryUserRole(org=org).has_user(user) or user_has_role(user, LibraryUserRole(course_key)):
Expand Down
2 changes: 1 addition & 1 deletion common/djangoapps/student/role_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,4 @@ def get_course_roles(user: User) -> list[CourseAccessRole]:
"""
# pylint: disable=protected-access
role_cache = get_role_cache(user)
return list(role_cache._roles)
return list(role_cache.all_roles_set)
85 changes: 72 additions & 13 deletions common/djangoapps/student/roles.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
"""


from collections import defaultdict
import logging
from abc import ABCMeta, abstractmethod
from collections import defaultdict
from contextlib import contextmanager

from django.contrib.auth.models import User # lint-amnesty, pylint: disable=imported-auth-user
Expand All @@ -23,6 +23,9 @@
# A mapping of roles to the roles that they inherit permissions from.
ACCESS_ROLES_INHERITANCE = {}

# The key used to store roles for a user in the cache that do not belong to a course or do not have a course id.
ROLE_CACHE_UNGROUPED_ROLES__KEY = 'ungrouped'


def register_access_role(cls):
"""
Expand Down Expand Up @@ -60,21 +63,52 @@ def strict_role_checking():
ACCESS_ROLES_INHERITANCE.update(OLD_ACCESS_ROLES_INHERITANCE)


def get_role_cache_key_for_course(course_key=None):
"""
Get the cache key for the course key.
"""
return str(course_key) if course_key else ROLE_CACHE_UNGROUPED_ROLES__KEY


class BulkRoleCache: # lint-amnesty, pylint: disable=missing-class-docstring
"""
This class provides a caching mechanism for roles grouped by users and courses,
using a nested dictionary structure to optimize lookup performance. The cache structure is designed as follows:

{
user_id_1: {
course_id_1: {role1, role2, role3}, # Set of roles associated with course_id_1
course_id_2: {role4, role5, role6}, # Set of roles associated with course_id_2
[ROLE_CACHE_UNGROUPED_ROLES_KEY]: {role7, role8} # Set of roles not tied to any specific course or library
},
user_id_2: { ... } # Similar structure for another user
}

- Each top-level dictionary entry keys by `user_id` to access role data for a specific user.
- Nested within each user's dictionary, entries are keyed by `course_id` grouping roles by course.
- The special key `ROLE_CACHE_UNGROUPED_ROLES_KEY` (a constant defined above)
stores roles that are not associated with any specific course or library.
"""

CACHE_NAMESPACE = "student.roles.BulkRoleCache"
CACHE_KEY = 'roles_by_user'

@classmethod
def prefetch(cls, users): # lint-amnesty, pylint: disable=missing-function-docstring
roles_by_user = defaultdict(set)
roles_by_user = defaultdict(lambda: defaultdict(set))
get_cache(cls.CACHE_NAMESPACE)[cls.CACHE_KEY] = roles_by_user
Copy link
Member

@kdmccormick kdmccormick Apr 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also raises my eyebrows performance-wise, although I might just be mis-understanding it.

roles_by_user is a dict which is stored in the cache, and the next few lines populate the dict. Does every update to roles_by_user trigger a cache-write? Or does the code somehow know that it needs to bulk-update the roles_by_user cache entry once this method finishes

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this works because it's a Request cache, so just an in-memory dict with no write-through to an external cache server.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, ty

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you referring to the existing code or to my code change here?
I have to say I was quite confused by the prefetch method as well - the line
get_cache(cls.CACHE_NAMESPACE)[cls.CACHE_KEY] = roles_by_user is a pretty weird looking way to write to a cache, especially since the RequestCache has explicit set operations.
I was wondering if this even works and if so I highly doubt that this is good practice.

However my change was only to change the datastructure in the cache for each user from a set to a dict, so I didn't want to touch this to avoid breaking anything. But I was going to ask you.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I asked in our python slack about it and they confirmed that the RequestCache is pretty much just a writable dict. It's still weird to me that you'd do this instead of using it's explicit setter method.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is very likely that this usage of RequestCache just predates RequestCache having a setter method.


for role in CourseAccessRole.objects.filter(user__in=users).select_related('user'):
ormsbee marked this conversation as resolved.
Show resolved Hide resolved
roles_by_user[role.user.id].add(role)
user_id = role.user.id
course_id = get_role_cache_key_for_course(role.course_id)

# Add role to the set in roles_by_user[user_id][course_id]
user_roles_set_for_course = roles_by_user[user_id][course_id]
user_roles_set_for_course.add(role)

users_without_roles = [u for u in users if u.id not in roles_by_user]
for user in users_without_roles:
roles_by_user[user.id] = set()
roles_by_user[user.id] = {}

@classmethod
def get_user_roles(cls, user):
Expand All @@ -83,15 +117,32 @@ def get_user_roles(cls, user):

class RoleCache:
"""
A cache of the CourseAccessRoles held by a particular user
A cache of the CourseAccessRoles held by a particular user.
Internal data structures should be accessed by getter and setter methods;
don't use `_roles_by_course_id` or `_roles` directly.
_roles_by_course_id: This is the data structure as saved in the RequestCache.
It contains all roles for a user as a dict that's keyed by course_id.
The key ROLE_CACHE_UNGROUPED_ROLES__KEY is used for all roles
that are not associated with a course.
_roles: This is a set of all roles for a user, ungrouped. It's used for some types of
lookups and collected from _roles_by_course_id on initialization
so that it doesn't need to be recalculated.

"""
def __init__(self, user):
try:
self._roles = BulkRoleCache.get_user_roles(user)
self._roles_by_course_id = BulkRoleCache.get_user_roles(user)
except KeyError:
self._roles = set(
CourseAccessRole.objects.filter(user=user).all()
)
self._roles_by_course_id = {}
roles = CourseAccessRole.objects.filter(user=user).all()
for role in roles:
course_id = get_role_cache_key_for_course(role.course_id)
if not self._roles_by_course_id.get(course_id):
self._roles_by_course_id[course_id] = set()
self._roles_by_course_id[course_id].add(role)
self._roles = set()
for roles_for_course in self._roles_by_course_id.values():
self._roles.update(roles_for_course)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: Shouldn't this kind of refresh logic be delegated to something in BulkRoleCache?

Copy link
Member Author

@jesperhodge jesperhodge Apr 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it should. The BulkRoleCache is responsible for roles in general it seems and this is scoped to a specific user's roles, so you could maybe make some case for it to stay here.

The refresh happened here originally as well, so I tried to change as few things as I could to avoid breaking stuff. With your guidance here I could see about moving this logic to the BulkRoleCache in some way, potentially, but maybe that's rather part of a separate need to work on the RoleCache in general:

Generally I wasn't very impressed with the design of the RolesCache and BulkRoleCache, given that

  • the performance was not optimal, as I've explained
  • the separation between RolesCache and BulkRolesCache is not very clear since as you said, maybe this refresh logic belongs in the BulkRoleCache
  • I'm not quite sure why you have a RequestCache with a BulkRoleCache but then you instead cache a single user's roles in the _roles property of the RoleCache here instead of the RequestCache. Is the RequestCache somehow slower, or why not just fetch the roles you want from the RequestCache and maybe avoid the whole problem we're having?
  • the private data attribute _roles was accessed directly instead of by a getter in some external code which means that changing it means you may break places in the codebase you aren't aware of, which is why I tried to not change that attribute's internal data structure and why I was so concerned about changing any code here
  • The roleCache is initialized and saved in a user's _.roles attribute, named the same as the roleCache's internal data attribute _roles, so you'll need to do something like user_roles = current_user._roles._roles which is extremely confusing
  • Outside of this implementation, it's anyway very weird how the RoleCache is randomly saved as an attribute a user has. The way it's done is just that some code somewhere suddenly adds this attribute _roles to a user object, which is not a set of roles but instead an instance of RoleCache. The problem besides the naming is that that's not a standard attribute a user has and you'll just go through code line by line to see if the code somewhere instantiates a RoleCache and adds it as such an attribute to a user. It's completely intransparent whether the user object will have an attached RoleCache or not at a given moment in time
  • The BulkRoleCache's prefetch method uses very weird logic to set or overwrite the RequestCache that completely ignores the RequestCache's setter methods

I could probably find some more things that bother me.
Now a few things of these I'm attempting to fix here but I'm not trying to fix the overall design that shows the problems you see above. I'll try to keep these two things separate, which is why I probably wouldn't move the refresh logic here to the BulkRoleCache.

It probably would make sense to improve the architecture of this RoleCache in a separate issue, though. I'd love to see that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not quite sure why you have a RequestCache with a BulkRoleCache but then you instead cache a single user's roles in the _roles property of the RoleCache here instead of the RequestCache. Is the RequestCache somehow slower, or why not just fetch the roles you want from the RequestCache and maybe avoid the whole problem we're having?

I'd have to dig into the commit history, but keep in mind that things like this sometimes spring up as a series of bandaids that deal with specific bugs/issues over time, and don't necessarily add up to a coherent design intent. (Though sometimes they do, and those can be mind-bending. 😛 )

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Outside of this implementation, it's anyway very weird how the RoleCache is randomly saved as an attribute a user has. The way it's done is just that some code somewhere suddenly adds this attribute _roles to a user object, which is not a set of roles but instead an instance of RoleCache. The problem besides the naming is that that's not a standard attribute a user has and you'll just go through code line by line to see if the code somewhere instantiates a RoleCache and adds it as such an attribute to a user. It's completely intransparent whether the user object will have an attached RoleCache or not at a given moment in time

I think this is part of the early days of fumbling around as to how to cache something at the request level. There's no real built-in thing for this in Django, but there is a User object that's passed into the view as part of the Request, and that Request can be munged with in middleware. So if you want to stick some data onto a user for the duration of the request, you can slap it in there. Not something I'd recommend as a good practice, but I'd bet that's where it started from.


@staticmethod
def get_roles(role):
Expand All @@ -100,16 +151,24 @@ def get_roles(role):
"""
return ACCESS_ROLES_INHERITANCE.get(role, set()) | {role}

@property
def all_roles_set(self):
return self._roles

@property
def roles_by_course_id(self):
return self._roles_by_course_id

def has_role(self, role, course_id, org):
"""
Return whether this RoleCache contains a role with the specified role
or a role that inherits from the specified role, course_id and org.
"""
course_id_string = get_role_cache_key_for_course(course_id)
course_roles = self._roles_by_course_id.get(course_id_string, [])
return any(
access_role.role in self.get_roles(role) and
access_role.course_id == course_id and
access_role.org == org
for access_role in self._roles
access_role.role in self.get_roles(role) and access_role.org == org
for access_role in course_roles
)


Expand Down
84 changes: 80 additions & 4 deletions common/djangoapps/student/tests/test_roles.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import ddt
from django.test import TestCase
from opaque_keys.edx.keys import CourseKey
from opaque_keys.edx.locator import LibraryLocator

from common.djangoapps.student.roles import (
CourseAccessRole,
Expand All @@ -22,7 +23,9 @@
OrgContentCreatorRole,
OrgInstructorRole,
OrgStaffRole,
RoleCache
RoleCache,
get_role_cache_key_for_course,
ROLE_CACHE_UNGROUPED_ROLES__KEY
)
from common.djangoapps.student.role_helpers import get_course_roles, has_staff_roles
from common.djangoapps.student.tests.factories import AnonymousUserFactory, InstructorFactory, StaffFactory, UserFactory
Expand All @@ -35,7 +38,7 @@ class RolesTestCase(TestCase):

def setUp(self):
super().setUp()
self.course_key = CourseKey.from_string('edX/toy/2012_Fall')
self.course_key = CourseKey.from_string('course-v1:course-v1:edX+toy+2012_Fall')
self.course_loc = self.course_key.make_usage_key('course', '2012_Fall')
self.anonymous_user = AnonymousUserFactory()
self.student = UserFactory()
Expand Down Expand Up @@ -189,8 +192,9 @@ def test_get_orgs_for_user(self):
@ddt.ddt
class RoleCacheTestCase(TestCase): # lint-amnesty, pylint: disable=missing-class-docstring

IN_KEY = CourseKey.from_string('edX/toy/2012_Fall')
NOT_IN_KEY = CourseKey.from_string('edX/toy/2013_Fall')
IN_KEY_STRING = 'course-v1:edX+toy+2012_Fall'
IN_KEY = CourseKey.from_string(IN_KEY_STRING)
NOT_IN_KEY = CourseKey.from_string('course-v1:edX+toy+2013_Fall')

ROLES = (
(CourseStaffRole(IN_KEY), ('staff', IN_KEY, 'edX')),
Expand Down Expand Up @@ -233,3 +237,75 @@ def test_only_in_role(self, role, target):
def test_empty_cache(self, role, target): # lint-amnesty, pylint: disable=unused-argument
cache = RoleCache(self.user)
assert not cache.has_role(*target)

def test_get_role_cache_key_for_course_for_course_object_gets_string(self):
"""
Given a valid course key object, get_role_cache_key_for_course
should return the string representation of the key.
"""
course_string = 'course-v1:edX+toy+2012_Fall'
key = CourseKey.from_string(course_string)
key = get_role_cache_key_for_course(key)
assert key == course_string

def test_get_role_cache_key_for_course_for_undefined_object_returns_default(self):
"""
Given a value None, get_role_cache_key_for_course
should return the default key for ungrouped courses.
"""
key = get_role_cache_key_for_course(None)
assert key == ROLE_CACHE_UNGROUPED_ROLES__KEY

def test_role_cache_get_roles_set(self):
"""
Test that the RoleCache.all_roles_set getter method returns a flat set of all roles for a user
and that the ._roles attribute is the same as the set to avoid legacy behavior being broken.
"""
lib0 = LibraryLocator.from_string('library-v1:edX+quizzes')
course0 = CourseKey.from_string('course-v1:edX+toy+2012_Summer')
course1 = CourseKey.from_string('course-v1:edX+toy2+2013_Fall')
role_library_v1 = LibraryUserRole(lib0)
role_course_0 = CourseInstructorRole(course0)
role_course_1 = CourseInstructorRole(course1)

role_library_v1.add_users(self.user)
role_course_0.add_users(self.user)
role_course_1.add_users(self.user)

cache = RoleCache(self.user)
assert cache.has_role('library_user', lib0, 'edX')
assert cache.has_role('instructor', course0, 'edX')
assert cache.has_role('instructor', course1, 'edX')

assert len(cache.all_roles_set) == 3
roles_set = cache.all_roles_set
for role in roles_set:
assert role.course_id.course in ('quizzes', 'toy2', 'toy')

assert roles_set == cache._roles # pylint: disable=protected-access

def test_role_cache_roles_by_course_id(self):
"""
Test that the RoleCache.roles_by_course_id getter method returns a dictionary of roles for a user
that are grouped by course_id or if ungrouped by the ROLE_CACHE_UNGROUPED_ROLES__KEY.
"""
lib0 = LibraryLocator.from_string('library-v1:edX+quizzes')
course0 = CourseKey.from_string('course-v1:edX+toy+2012_Summer')
course1 = CourseKey.from_string('course-v1:edX+toy2+2013_Fall')
role_library_v1 = LibraryUserRole(lib0)
role_course_0 = CourseInstructorRole(course0)
role_course_1 = CourseInstructorRole(course1)
role_org_staff = OrgStaffRole('edX')

role_library_v1.add_users(self.user)
role_course_0.add_users(self.user)
role_course_1.add_users(self.user)
role_org_staff.add_users(self.user)

cache = RoleCache(self.user)
roles_dict = cache.roles_by_course_id
assert len(roles_dict) == 4
assert roles_dict.get(ROLE_CACHE_UNGROUPED_ROLES__KEY).pop().role == 'staff'
assert roles_dict.get('library-v1:edX+quizzes').pop().course_id.course == 'quizzes'
assert roles_dict.get('course-v1:edX+toy+2012_Summer').pop().course_id.course == 'toy'
assert roles_dict.get('course-v1:edX+toy2+2013_Fall').pop().course_id.course == 'toy2'
Loading