diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..5c3bb29 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include cs_storage/templates/index.html \ No newline at end of file diff --git a/cs_storage/__init__.py b/cs_storage/__init__.py index f4bc990..09c0537 100644 --- a/cs_storage/__init__.py +++ b/cs_storage/__init__.py @@ -11,6 +11,8 @@ from marshmallow import Schema, fields, validate +from .screenshot import screenshot, ScreenshotError, SCREENSHOT_ENABLED + __version__ = "1.7.0" @@ -21,6 +23,7 @@ class Serializer: """ Base class for serializng input data to bytes and back. """ + def __init__(self, ext): self.ext = ext @@ -77,9 +80,24 @@ def get_serializer(media_type): class Output: """Output mixin shared among LocalOutput and RemoteOutput""" + id = fields.UUID(required=False) title = fields.Str() media_type = fields.Str( - validate=validate.OneOf(choices=["bokeh", "table", "CSV", "PNG", "JPEG", "MP3", "MP4", "HDF5", "PDF", "Markdown", "Text"]) + validate=validate.OneOf( + choices=[ + "bokeh", + "table", + "CSV", + "PNG", + "JPEG", + "MP3", + "MP4", + "HDF5", + "PDF", + "Markdown", + "Text", + ] + ) ) @@ -111,6 +129,28 @@ class LocalResult(Schema): downloadable = fields.Nested(LocalOutput, many=True) +def write_pic(fs, output): + if SCREENSHOT_ENABLED: + s = time.time() + try: + pic_data = screenshot(output) + except ScreenshotError: + print("failed to create screenshot for ", output["id"]) + return + else: + with fs.open(f"{BUCKET}/{output['id']}.png", "wb") as f: + f.write(pic_data) + f = time.time() + print(f"Pic write finished in {f-s}s") + else: + import warnings + + warnings.warn( + "Screenshot not enabled. Make sure you have installed " + "the optional packages listed in environment.yaml." 
+ ) + + def write(task_id, loc_result, do_upload=True): fs = gcsfs.GCSFileSystem() s = time.time() @@ -124,17 +164,21 @@ def write(task_id, loc_result, do_upload=True): for output in loc_result[category]: serializer = get_serializer(output["media_type"]) ser = serializer.serialize(output["data"]) + output["id"] = str(uuid.uuid4()) filename = output["title"] if not filename.endswith(f".{serializer.ext}"): filename += f".{serializer.ext}" zipfileobj.writestr(filename, ser) rem_result[category]["outputs"].append( { + "id": output["id"], "title": output["title"], "media_type": output["media_type"], "filename": filename, } ) + if do_upload and category == "renderable": + write_pic(fs, output) zipfileobj.close() buff.seek(0) if do_upload: @@ -160,9 +204,12 @@ def read(rem_result, json_serializable=True): for rem_output in rem_result[category]["outputs"]: ser = get_serializer(rem_output["media_type"]) - rem_data = ser.deserialize(zipfileobj.read(rem_output["filename"]), json_serializable) + rem_data = ser.deserialize( + zipfileobj.read(rem_output["filename"]), json_serializable + ) read[category].append( { + "id": rem_output.get("id", None), "title": rem_output["title"], "media_type": rem_output["media_type"], "data": rem_data, @@ -171,3 +218,11 @@ def read(rem_result, json_serializable=True): f = time.time() print(f"Read finished in {f-s}s") return read + + +def add_screenshot_links(rem_result): + for rem_output in rem_result["renderable"]["outputs"]: + rem_output[ + "screenshot" + ] = f"https://storage.googleapis.com/{BUCKET}/{rem_output['id']}.png" + return rem_result diff --git a/cs_storage/screenshot.py b/cs_storage/screenshot.py new file mode 100644 index 0000000..f4c426a --- /dev/null +++ b/cs_storage/screenshot.py @@ -0,0 +1,111 @@ +import asyncio +import os +import tempfile + +try: + # These dependencies are optional. The storage component may be used + # without the screenshot component. 
+ from jinja2 import Template + from pyppeteer import launch + from bokeh.resources import CDN + + BASE_ARGS = { + "bokeh_scripts": {"cdn_js": CDN.js_files[0], "widget_js": CDN.js_files[1]} + } + SCREENSHOT_ENABLED = True + +except ImportError: + SCREENSHOT_ENABLED = False + Template = None + launch = None + CDN = None + BASE_ARGS = {} + +import cs_storage + + +CURRENT_DIR = os.path.abspath(os.path.dirname(__file__)) + + +class ScreenshotError(Exception): + pass + + +def get_template(): + if not SCREENSHOT_ENABLED: + return None + with open(f"{CURRENT_DIR}/templates/index.html", "r") as f: + text = f.read() + + template = Template(text) + + return template + + +TEMPLATE = get_template() + + +def write_template(output): + kwargs = {**BASE_ARGS, **{"output": output}} + return TEMPLATE.render(**kwargs) + + +async def _screenshot(template_path, pic_path): + """ + Use pyppeteer, a python port of puppeteer, to open the + template at template_path and take a screenshot of the + output that is rendered within it. + + The output is rendered within a Bootstrap card element. + This element is only as big as the elements that it contains. + Thus, we only need to get the dimensions of the bootstrap + card to figure out which part of the screen we need to use + for the screenshot! + + Note: pyppeteer looks stale. If it continues to not be + maintained well, then the extremely active, well-maintained + puppeteer should be used for creating these screenshots. The + downside of using puppeteer is that it is written in nodejs. 
+ """ + browser = await launch(args=["--no-sandbox"]) + page = await browser.newPage() + await page.goto(f"file://{template_path}") + await page.setViewport(dict(width=1920, height=1080)) + await page.waitFor(1000) + element = await page.querySelector("#output") + if element is None: + raise ScreenshotError("Unable to take screenshot.") + boundingbox = await element.boundingBox() + clip = dict( + x=boundingbox["x"], + y=boundingbox["y"], + width=min(boundingbox["width"], 1920), + height=min(boundingbox["height"], 1080), + ) + await page.screenshot(path=f"{pic_path}", type="png", clip=clip) + await browser.close() + + +def screenshot(output, debug=False): + """ + Create screenshot of outputs. The intermediate results are + written to temporary files and a picture, represented as a + stream of bytes, is returned. + """ + if not SCREENSHOT_ENABLED: + return None + html = write_template(output) + with tempfile.NamedTemporaryFile(suffix=".html") as temp: + if debug: + with open(f'{output["title"]}.html', "w") as f: + f.write(html) + temp.write(html.encode("utf-8")) + temp.seek(0) + template_path = temp.name + with tempfile.NamedTemporaryFile(suffix=".png") as pic: + pic_path = pic.name + asyncio.get_event_loop().run_until_complete( + _screenshot(template_path, pic_path) + ) + pic_bytes = pic.read() + return pic_bytes diff --git a/cs_storage/templates/index.html b/cs_storage/templates/index.html new file mode 100644 index 0000000..f712ebe --- /dev/null +++ b/cs_storage/templates/index.html @@ -0,0 +1,60 @@ + + + + + Compute Studio + + + + + + + + + + + + + + + + +
+
+
+

{{output.title}}

+ {% if output.media_type == 'bokeh' %} +
{{output.id}} +
+
+ {% elif output.media_type == 'table' %} +
+ {{output.data|safe}} +
+ {% elif output.media_type == 'PNG' %} +
+ +
+ {% elif output.media_type == 'JPEG' %} +
+ +
+ {% endif %} +
+
+
+ + +{% if output.media_type == "bokeh" %} + +{% endif %} + + \ No newline at end of file diff --git a/cs_storage/tests/test-tb-remote.json b/cs_storage/tests/test-tb-remote.json new file mode 100644 index 0000000..795c223 --- /dev/null +++ b/cs_storage/tests/test-tb-remote.json @@ -0,0 +1,111 @@ +{ + "outputs": { + "renderable": { + "outputs": [ + { + "id": "ce297163-20cc-44a3-b565-9fa57bcef8ae", + "title": "", + "filename": ".json", + "media_type": "bokeh" + }, + { + "id": "66c7e9df-62f5-4199-a137-168bd32d245a", + "title": "Aggregate Results", + "filename": "Aggregate Results.json", + "media_type": "bokeh" + }, + { + "id": "5788882a-8c5f-4a90-bb40-1b1b24d72667", + "title": "Tables", + "filename": "Tables.json", + "media_type": "bokeh" + } + ], + "ziplocation": "0cde3e26-94ea-4483-814a-3aa397d9d5e6_renderable.zip" + }, + "downloadable": { + "outputs": [ + { + "id": "60d0f4b0-d687-4a1a-8ac3-d4cf44ec6158", + "title": "Total Liabilities Change by Calendar Year (Billions).csv", + "filename": "Total Liabilities Change by Calendar Year (Billions).csv", + "media_type": "CSV" + }, + { + "id": "92b7cb97-5086-4074-bab8-6936a24154ec", + "title": "Total Liabilities Baseline by Calendar Year (Billions).csv", + "filename": "Total Liabilities Baseline by Calendar Year (Billions).csv", + "media_type": "CSV" + }, + { + "id": "9549c414-92bb-463a-83cd-0f20f94cd98c", + "title": "Total Liabilities Reform by Calendar Year (Billions).csv", + "filename": "Total Liabilities Reform by Calendar Year (Billions).csv", + "media_type": "CSV" + }, + { + "id": "81cb43e9-b906-466f-83c6-7321a7372d8e", + "title": "Base plan tax vars, weighted total by expanded income bin (2019).csv", + "filename": "Base plan tax vars, weighted total by expanded income bin (2019).csv", + "media_type": "CSV" + }, + { + "id": "2a3387d8-3837-419f-bf9f-210e4883f7ea", + "title": "User plan tax vars, weighted total by expanded income bin (2019).csv", + "filename": "User plan tax vars, weighted total by expanded income bin 
(2019).csv", + "media_type": "CSV" + }, + { + "id": "b40043b0-79e6-4654-9ca9-ea3235421930", + "title": "Individual Income Tax: Difference between Base and User plans by expanded income bin (2019).csv", + "filename": "Individual Income Tax: Difference between Base and User plans by expanded income bin (2019).csv", + "media_type": "CSV" + }, + { + "id": "b8c3cafa-5e23-4cc1-b41a-11740718a14c", + "title": "Payroll Tax: Difference between Base and User plans by expanded income bin (2019).csv", + "filename": "Payroll Tax: Difference between Base and User plans by expanded income bin (2019).csv", + "media_type": "CSV" + }, + { + "id": "cec78b20-4137-4b71-8ada-f415791e88ae", + "title": "Combined Payroll and Individual Income Tax: Difference between Base and User plans by expanded income bin (2019).csv", + "filename": "Combined Payroll and Individual Income Tax: Difference between Base and User plans by expanded income bin (2019).csv", + "media_type": "CSV" + }, + { + "id": "47517e7d-e033-474b-9b69-97b6c19825fc", + "title": "Base plan tax vars, weighted total by expanded income decile (2019).csv", + "filename": "Base plan tax vars, weighted total by expanded income decile (2019).csv", + "media_type": "CSV" + }, + { + "id": "49122b30-0baf-4347-bfd1-77d4a1606dae", + "title": "User plan tax vars, weighted total by expanded income decile (2019).csv", + "filename": "User plan tax vars, weighted total by expanded income decile (2019).csv", + "media_type": "CSV" + }, + { + "id": "e2bd81b6-dccc-4df9-b0fe-ab4bbf1297f8", + "title": "Individual Income Tax: Difference between Base and User plans by expanded income decile (2019).csv", + "filename": "Individual Income Tax: Difference between Base and User plans by expanded income decile (2019).csv", + "media_type": "CSV" + }, + { + "id": "047baaff-f2a8-4b8f-b0de-be42ddbe9249", + "title": "Payroll Tax: Difference between Base and User plans by expanded income decile (2019).csv", + "filename": "Payroll Tax: Difference between Base and User 
plans by expanded income decile (2019).csv", + "media_type": "CSV" + }, + { + "id": "9f863037-390b-46c8-8971-ea22afcee630", + "title": "Combined Payroll and Individual Income Tax: Difference between Base and User plans by expanded income decile (2019).csv", + "filename": "Combined Payroll and Individual Income Tax: Difference between Base and User plans by expanded income decile (2019).csv", + "media_type": "CSV" + } + ], + "ziplocation": "0cde3e26-94ea-4483-814a-3aa397d9d5e6_downloadable.zip" + } + }, + "version": "v1" +} \ No newline at end of file diff --git a/cs_storage/tests/test-tc-remote.json b/cs_storage/tests/test-tc-remote.json new file mode 100644 index 0000000..3913a8d --- /dev/null +++ b/cs_storage/tests/test-tc-remote.json @@ -0,0 +1,57 @@ +{ + "outputs": { + "renderable": { + "outputs": [ + { + "id": "0469459a-82ab-4018-be30-3862a5383d4e", + "title": "Basic Liabilities", + "filename": "Basic Liabilities.html", + "media_type": "table" + }, + { + "id": "17162a2f-8951-4c8e-9795-44284e520f72", + "title": "Tax Liabilities by Wage (Holding Other Inputs Constant)", + "filename": "Tax Liabilities by Wage (Holding Other Inputs Constant).json", + "media_type": "bokeh" + }, + { + "id": "43b8914c-6832-4308-b4fa-6a13f899f239", + "title": "Tax Rates by Wage (Holding Other Inputs Constant)", + "filename": "Tax Rates by Wage (Holding Other Inputs Constant).json", + "media_type": "bokeh" + }, + { + "id": "51dccf29-f9b0-42a8-8c76-17ef6544ba47", + "title": "Tax Credits by Wage (Holding Other Inputs Constant)", + "filename": "Tax Credits by Wage (Holding Other Inputs Constant).json", + "media_type": "bokeh" + }, + { + "id": "435e4de6-23b4-4693-8a3a-2b7917190f09", + "title": "Calculation of Liabilities", + "filename": "Calculation of Liabilities.html", + "media_type": "table" + } + ], + "ziplocation": "0fe89f27-37f3-4342-95a6-ffa952e3d4c6_renderable.zip" + }, + "downloadable": { + "outputs": [ + { + "id": "8d34d210-f275-495b-839e-18acc1fc6d2a", + "title": "basic_table", 
+ "filename": "basic_table.csv", + "media_type": "CSV" + }, + { + "id": "e1b80703-2088-4938-afa4-c3b49bb3142a", + "title": "calculation_table", + "filename": "calculation_table.csv", + "media_type": "CSV" + } + ], + "ziplocation": "0fe89f27-37f3-4342-95a6-ffa952e3d4c6_downloadable.zip" + } + }, + "version": "v1" +} \ No newline at end of file diff --git a/cs_storage/tests/test_cs_storage.py b/cs_storage/tests/test_cs_storage.py index cc95ade..bb1f5db 100644 --- a/cs_storage/tests/test_cs_storage.py +++ b/cs_storage/tests/test_cs_storage.py @@ -15,13 +15,14 @@ def png(): import matplotlib.pyplot as plt import numpy as np + x = np.linspace(0, 2, 100) plt.figure() - plt.plot(x, x, label='linear') - plt.plot(x, x**2, label='quadratic') - plt.plot(x, x**3, label='cubic') - plt.xlabel('x label') - plt.ylabel('y label') + plt.plot(x, x, label="linear") + plt.plot(x, x ** 2, label="quadratic") + plt.plot(x, x ** 3, label="cubic") + plt.xlabel("x label") + plt.ylabel("y label") plt.title("Simple Plot") plt.legend() initial_buff = io.BytesIO() @@ -34,13 +35,14 @@ def png(): def jpg(): import matplotlib.pyplot as plt import numpy as np + x = np.linspace(0, 2, 100) plt.figure() - plt.plot(x, x, label='linear') - plt.plot(x, x**2, label='quadratic') - plt.plot(x, x**3, label='cubic') - plt.xlabel('x label') - plt.ylabel('y label') + plt.plot(x, x, label="linear") + plt.plot(x, x ** 2, label="quadratic") + plt.plot(x, x ** 3, label="cubic") + plt.xlabel("x label") + plt.ylabel("y label") plt.title("Simple Plot") plt.legend() initial_buff = io.BytesIO() @@ -107,6 +109,7 @@ def test_get_serializer(): def test_cs_storage(png, jpg): + dummy_uuid = "c7a65ad2-0c2c-45d7-b0f7-d9fd524c49b3" exp_loc_res = { "renderable": [ { @@ -114,76 +117,76 @@ def test_cs_storage(png, jpg): "title": "bokeh plot", "data": {"html": "
", "javascript": "console.log('hello world')"}, }, - { - "media_type": "table", - "title": "table stuff", - "data": "", - }, - { - "media_type": "PNG", - "title": "PNG data", - "data": png, - }, - { - "media_type": "JPEG", - "title": "JPEG data", - "data": jpg, - }, - { - "media_type": "MP3", - "title": "MP3 data", - "data": b"MP3 bytes", - }, - - { - "media_type": "MP4", - "title": "MP4 data", - "data": b"MP4 bytes", - }, + {"media_type": "table", "title": "table stuff", "data": "
"}, + {"media_type": "PNG", "title": "PNG data", "data": png}, + {"media_type": "JPEG", "title": "JPEG data", "data": jpg}, + {"media_type": "MP3", "title": "MP3 data", "data": b"MP3 bytes"}, + {"media_type": "MP4", "title": "MP4 data", "data": b"MP4 bytes"}, ], "downloadable": [ - { - "media_type": "CSV", - "title": "CSV file", - "data": "comma,sep,values\n" - }, + {"media_type": "CSV", "title": "CSV file", "data": "comma,sep,values\n"}, { "media_type": "HDF5", "title": "HDF5 file", - "data": b"serialized numpy arrays and such\n" - }, - { - "media_type": "PDF", - "title": "PDF file", - "data": b"some pdf like data." + "data": b"serialized numpy arrays and such\n", }, + {"media_type": "PDF", "title": "PDF file", "data": b"some pdf like data."}, { "media_type": "Markdown", "title": "Markdown file", - "data": "**hello world**" - }, - { - "media_type": "Text", - "title": "Text file", - "data": "text data" + "data": "**hello world**", }, + {"media_type": "Text", "title": "Text file", "data": "text data"}, ], } - task_id = uuid.uuid4() + task_id = "1868c4a7-b03c-4fe4-ab45-0aa95c0bfa53" rem_res = cs_storage.write(task_id, exp_loc_res) loc_res = cs_storage.read(rem_res, json_serializable=False) - assert loc_res == exp_loc_res - assert json.dumps( - cs_storage.read(rem_res, json_serializable=True) + for output_type in ["renderable", "downloadable"]: + loc_res_without_id = [ + {k: v for k, v in output.items() if k != "id"} + for output in loc_res[output_type] + ] + exp_res_without_id = [ + {k: v for k, v in output.items() if k != "id"} + for output in exp_loc_res[output_type] + ] + assert exp_res_without_id == loc_res_without_id + + assert json.dumps(cs_storage.read(rem_res, json_serializable=True)) + + loc_res1 = cs_storage.read( + {"renderable": rem_res["renderable"]}, json_serializable=False ) - - loc_res1 = cs_storage.read({"renderable": rem_res["renderable"]}, json_serializable=False) - assert loc_res1["renderable"] == exp_loc_res["renderable"] + loc_res_without_id = [ 
+ {k: v for k, v in output.items() if k != "id"} + for output in loc_res1["renderable"] + ] + exp_res_without_id = [ + {k: v for k, v in output.items() if k != "id"} + for output in exp_loc_res["renderable"] + ] + + assert exp_res_without_id == loc_res_without_id assert json.dumps( cs_storage.read({"renderable": rem_res["renderable"]}, json_serializable=True) ) +def test_add_screenshot_links(): + rem_res = {"renderable": {"outputs": [{"id": "1234"}, {"id": "4567"}]}} + + url = f"https://storage.googleapis.com/{cs_storage.BUCKET}/" + assert cs_storage.add_screenshot_links(rem_res) == { + "renderable": { + "outputs": [ + {"id": "1234", "screenshot": url + "1234.png"}, + {"id": "4567", "screenshot": url + "4567.png"}, + ] + } + } + + def test_errors(): with pytest.raises(exceptions.ValidationError): cs_storage.write("123", {"bad": "data"}) diff --git a/cs_storage/tests/test_screenshot.py b/cs_storage/tests/test_screenshot.py new file mode 100644 index 0000000..da62733 --- /dev/null +++ b/cs_storage/tests/test_screenshot.py @@ -0,0 +1,29 @@ +import json +import os + +import cs_storage + + +CURRENT_DIR = os.path.abspath(os.path.dirname(__file__)) + + +def test_taxcruncher_outputs(): + with open(f"{CURRENT_DIR}/test-tc-remote.json") as f: + remote_outputs = json.loads(f.read()) + outputs = cs_storage.read(remote_outputs["outputs"]) + + for output in outputs["renderable"]: + basename = f"{output['title'] or 'template'}.html" + print(f"screenshotting: {basename}") + cs_storage.screenshot(output) + + +def test_taxbrain_outputs(): + with open(f"{CURRENT_DIR}/test-tb-remote.json") as f: + remote_outputs = json.loads(f.read()) + outputs = cs_storage.read(remote_outputs["outputs"]) + + for output in outputs["renderable"]: + basename = f"{output['title'] or 'template'}.html" + print(f"screenshotting: {basename}") + cs_storage.screenshot(output) diff --git a/environment.yml b/environment.yml index 0d95114..75aea53 100644 --- a/environment.yml +++ b/environment.yml @@ -5,5 +5,10 
@@ dependencies: - "marshmallow>=3.0.0" - pytest - gcsfs + - jinja2 # optional + - pyppeteer # optional + - bokeh # optional + - websockets # optional - matplotlib - numpy + - "pillow<7" \ No newline at end of file