Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to py4j #5

Merged
merged 7 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
__pycache__
bin/Lomikel*.jar
52 changes: 9 additions & 43 deletions apps/utils/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,9 @@
# limitations under the License.
"""Utilities to work with the Fink HBase client"""

from py4j.java_gateway import JavaGateway
import os

import jpype
import jpype.imports
import numpy as np
import yaml

Expand All @@ -26,24 +25,6 @@
from line_profiler import profile


@profile
def initialise_jvm(path=None):
"""Start a JVM

Parameters
----------
path: str, optional
Path to the HBase client. Default is relative to apps/
"""
if not jpype.isJVMStarted():
if path is None:
path = os.path.dirname(apps_loc) + "/../bin/FinkBrowser.exe.jar"
jarpath = f"-Djava.class.path={path}"
jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", jarpath, convertStrings=True)

jpype.attachThreadToJVM()


@profile
def connect_to_hbase_table(
tablename: str,
Expand All @@ -62,35 +43,24 @@ def connect_to_hbase_table(
Name of the rowkey in the table containing the schema. Default is given by the config file.
nlimit: int, optional
Maximum number of objects to return. Default is 10000
setphysicalrepo: bool, optional
If True, store cutouts queried on disk ("/tmp/Lomikel/HBaseClientBinaryDataRepository")
Needs client 02.01+. Default is False
config_path: str, optional
Path to the config file. Default is None (relative to the apps/ folder)
"""
initialise_jvm()

if config_path is None:
config_path = os.path.dirname(apps_loc) + "/../config.yml"
args = yaml.load(
open(config_path),
yaml.Loader,
)

import com.Lomikel.HBaser
from com.astrolabsoftware.FinkBrowser.Utils import Init

Init.init()

client = com.Lomikel.HBaser.HBaseClient(args["HBASEIP"], args["ZOOPORT"])
gateway = JavaGateway(auto_convert=True)
client = gateway.jvm.com.Lomikel.HBaser.HBaseClient(
args["HBASEIP"], args["ZOOPORT"]
)

if schema_name is None:
schema_name = args["SCHEMAVER"]
client.connect(tablename, schema_name)
if setphysicalrepo:
import com.Lomikel.HBaser.FilesBinaryDataRepository

client.setRepository(com.Lomikel.HBaser.FilesBinaryDataRepository())
client.setLimit(args["NLIMIT"])

return client
Expand Down Expand Up @@ -131,21 +101,17 @@ def create_or_update_hbase_table(
if len(np.unique(families)) != 1:
raise NotImplementedError("`create_hbase_table` only accepts one family name")

initialise_jvm()

if config_path is None:
config_path = os.path.dirname(apps_loc) + "/../config.yml"
args = yaml.load(
open(config_path),
yaml.Loader,
)

import com.Lomikel.HBaser
from com.astrolabsoftware.FinkBrowser.Utils import Init

Init.init()

client = com.Lomikel.HBaser.HBaseClient(args["HBASEIP"], args["ZOOPORT"])
gateway = JavaGateway(auto_convert=True)
client = gateway.jvm.com.Lomikel.HBaser.HBaseClient(
args["HBASEIP"], args["ZOOPORT"]
)

if create:
# Create the table and connect without schema
Expand Down
13 changes: 5 additions & 8 deletions apps/utils/decoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
# limitations under the License.
"""Utilities to decode data from the HBase client"""

from py4j.java_gateway import JavaGateway
import json
import pandas as pd
import numpy as np

Expand Down Expand Up @@ -144,17 +146,12 @@ def format_hbase_output(
@profile
def hbase_to_dict(hbase_output):
"""Optimize hbase output TreeMap for faster conversion to DataFrame"""
# Naive Python implementation
# optimized = {i: dict(j) for i, j in hbase_output.items()}

# Here we assume JPype is already initialized
import json

from org.json import JSONObject
gateway = JavaGateway(auto_convert=True)
JSONObject = gateway.jvm.org.json.JSONObject

# We do bulk export to JSON on Java side to avoid overheads of iterative access
# and then parse it back to Dict in Python
optimized = json.loads(JSONObject(hbase_output).toString())
optimized = json.loads(JSONObject(str(hbase_output)).toString())

return optimized

Expand Down
Binary file removed bin/FinkBrowser.exe.jar
Binary file not shown.
25 changes: 25 additions & 0 deletions bin/download_client.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Copyright 2024 AstroLab Software
# Author: Julien Peloton
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
BASEURL=https://hrivnac.web.cern.ch/Activities/Packages/Lomikel
CLIENTVERSION=03.04.00x


files="Lomikel-$CLIENTVERSION-ext.jar Lomikel-$CLIENTVERSION.exe.jar Lomikel-$CLIENTVERSION-HBase.jar Lomikel-$CLIENTVERSION-HBase.exe.jar Lomikel-$CLIENTVERSION.jar"

for file in $files; do
echo $file
wget --directory-prefix=. $BASEURL/$file
done
Binary file added bin/py4j0.10.9.7.jar
Binary file not shown.
1 change: 1 addition & 0 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ CUTOUTAPIURL: http://localhost
# HBase configuration
HBASEIP: localhost
ZOOPORT: 2183
CLIENTVERSION: 03.04.00x

# Table schema (schema_{fink_broker}_{fink_science})
SCHEMAVER: schema_3.1_5.21.14
Expand Down
51 changes: 47 additions & 4 deletions install/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,58 @@ pip install -r requirements.txt

Follow instructions in the [fink-cutout-api](https://github.com/astrolabsoftware/fink-cutout-api/blob/main/install/README.md).

## Client installation and Fink gateway

To access HBase tables, we use a client based on [Lomikel](https://github.com/hrivnac/Lomikel). To download the latest version of the client, go to `bin` and execute:

```bash
cd bin
./download_client.sh
```

Do not forget to update the version in the `config.yml` file. Then install (as sudo) a new unit for systemd under `/etc/systemd/system/fink_gateway.service` (check the correct version numbers for JARs):

```bash
[Unit]
Description=Start a JVM with Fink Java objects
After=network.target

[Service]
User=almalinux
Group=almalinux
WorkingDirectory=/home/almalinux/fink-object-api/bin

ExecStart=/bin/sh -c 'source /home/almalinux/.bashrc; exec java -cp "Lomikel-03.04.00x-HBase.exe.jar:py4j0.10.9.7.jar" com.Lomikel.Py4J.LomikelGatewayServer 2>&1 >> /tmp/fink_gateway.out'

[Install]
WantedBy=multi-user.target
```

Reload daemon and start the service:

```bash
sudo systemctl daemon-reload
sudo systemctl start fink_gateway
```

Check carefuly the status:

```bash
sudo systemctl start fink_gateway
```

Note that having a JVM open all the time can lead to a memory leak, so it is probably wise to restart the service from time to time.


## Systemctl and gunicorn

Install a new unit for systemd under `/etc/systemd/system/fink_object_api.service`:
Install a new unit (as sudo) for systemd under `/etc/systemd/system/fink_object_api.service`:

```bash
[Unit]
Description=gunicorn daemon for fink_object_api
After=network.target
After=network.target fink_gateway.service fink_cutout_api.service
Requires=fink_gateway.service fink_cutout_api.service

[Service]
User=almalinux
Expand All @@ -41,5 +85,4 @@ sudo systemctl daemon-reload
sudo systemctl start fink_object_api
```


You are ready to use the API!
Note that this will automatically starts `fink_gateway.service` and `fink_cutout_api.service` if they were not started. You are ready to use the API!
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ line_profiler
requests
pyarrow
matplotlib
JPype1
py4j
PyYAML
pyspark
Loading