Initial commit

This commit is contained in:
xinyangli 2023-12-19 14:39:13 +08:00
commit 866fc4bb28
22 changed files with 3839 additions and 0 deletions

63
scripts/gen_meta.py Normal file
View file

@ -0,0 +1,63 @@
import json
import cv2
import pprint
import os.path as osp
from cdf_parser import CdfParser
# Script: read one CDF JSON document per line from cdfs.json and print an HTML
# page to stdout with one flex row per document and one column per element.
import html

print("""
<html>
<head>
<style>
.row {
display: flex;
flex-direction: row;
# flex-wrap: wrap;
justify-content: start;
align-items: center;
overflow-x: auto;
margin-bottom: 16px;
align-items: start;
}
.column {
flex: 25%;
padding: 0 1px;
}
</style>
</head>
<body>
""")
# URL template; the "{}" placeholder is filled with the blob name of each image.
path = "https://internblob.blob.core.windows.net/v-lixinyang/canva-render-11.30/{}?sp=racwdli&st=2023-09-17T15:37:58Z&se=2023-12-31T23:37:58Z&spr=https&sv=2022-11-02&sr=c&sig=u%2FPbZ4fNttAPeLj0NEEpX0eIgFcjhot%2Bmy3iGd%2BCmxk%3D"
with open("cdfs.json", "r") as f:
    for line in f:
        cdf = json.loads(line)
        # Named `folder` rather than `id` so the builtin is not shadowed.
        folder = cdf['rendered_folder']
        cdf_parser = CdfParser(cdf['content'], folder)
        # Elements are displayed in the reverse of the order the parser returns.
        elements = list(cdf_parser.get_elements()[::-1])
        print('<div class="row">')
        for index, element in enumerate(elements):
            # The first column shows the "full" image; later columns show "(index - 1)".
            name = 'full' if index == 0 else f"({index - 1})"
            # Escape before inserting the markup below: element JSON may contain
            # "<", ">" or "&", which would otherwise be interpreted as HTML.
            element_text = (
                html.escape(json.dumps(element, indent=2), quote=False)
                .replace("\n", "<br/>")
                .replace(" ", "&nbsp;"*2)
            )
            print(f"""
<div class="column">
<img src="{path.format(folder + f"-{name}.png")}" alt="image" style="width: 300px;">
<br/>
<img src="{path.format(folder + f"-({index})-mask.png")}" alt="image" style="width: 300px;">
<p style="word-wrap: break-word; max-height: 300px; max-width: 300px; overflow: auto;"> {element_text} </p>
</div>
""")
        # Trailing column per row, labelled "Background", using the last layer index.
        print(f"""
<div class="column">
<img src="{path.format(folder + f"-({len(elements) - 1}).png")}" alt="image" style="width: 300px;">
<br/>
<p style="word-wrap: break-word; max-height: 300px; overflow: auto;"> Background </p>
</div>
""")
        print('</div>')
print("""
</body>
</html>
""")

22
scripts/jobgen.py Normal file

File diff suppressed because one or more lines are too long

174
scripts/post_processing.py Normal file
View file

@ -0,0 +1,174 @@
import logging
import asyncio
import time
from collections import Counter
import cv2
import numpy as np
from azure.storage.blob.aio import BlobServiceClient, download_blob_from_url
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCursor, AsyncIOMotorCollection, AsyncIOMotorDatabase
# URL (including SAS query string) passed to BlobServiceClient by BlobAsync.
# NOTE(review): the query string embeds a SAS signature (`sig=`) that is
# committed in source; consider loading it from the environment and rotating it.
AZUREBLOB_SAS_URL = "https://internblob.blob.core.windows.net/v-lixinyang/?sp=racwdli&st=2023-09-17T15:37:58Z&se=2023-12-31T23:37:58Z&spr=https&sv=2022-11-02&sr=c&sig=u%2FPbZ4fNttAPeLj0NEEpX0eIgFcjhot%2Bmy3iGd%2BCmxk%3D"
# Name handed to get_container_client() for every blob operation.
CONTAINER = "canva-render-10.19"
# Connection string handed to AsyncIOMotorClient in main().
MONGODB_URI = "mongodb://localhost:27017/canva"
class BlobAsync:
    """Async helpers for reading and writing blobs in ``CONTAINER``.

    Each method opens its own short-lived ``BlobServiceClient`` built from
    ``AZUREBLOB_SAS_URL``; the instance itself holds no state.
    """

    async def readall(self, blob):
        """Return the full content of *blob* as bytes, or ``None`` if it does not exist."""
        async with BlobServiceClient(AZUREBLOB_SAS_URL) as blob_service_client:
            container_client = blob_service_client.get_container_client(CONTAINER)
            blob_client = container_client.get_blob_client(blob)
            if not await blob_client.exists():
                return None
            stream = await blob_client.download_blob()
            # readall() on the aio downloader is a coroutine: await it here,
            # while the service client is still open, so callers receive bytes.
            return await stream.readall()

    async def open_image(self, blob: str):
        """Download *blob* and decode it as a colour image.

        Returns the array produced by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``,
        or ``None`` when the blob does not exist. ``cv2.imdecode`` itself also
        yields ``None`` for data it cannot decode.
        """
        async with BlobServiceClient(AZUREBLOB_SAS_URL) as blob_service_client:
            container_client = blob_service_client.get_container_client(CONTAINER)
            blob_client = container_client.get_blob_client(blob)
            try:
                if not await blob_client.exists():
                    return None
                stream = await blob_client.download_blob()
                buf = np.frombuffer(await stream.readall(), dtype=np.uint8)
                return cv2.imdecode(buf, cv2.IMREAD_COLOR)
            finally:
                # Close the derived clients on every path, including the
                # early "blob missing" return.
                await blob_client.close()
                await container_client.close()

    async def upload_image(self, blob, image):
        """PNG-encode *image* and upload it to *blob*, overwriting any existing blob.

        Raises:
            ValueError: if ``cv2.imencode`` reports that encoding failed.
        """
        async with BlobServiceClient(AZUREBLOB_SAS_URL) as blob_service_client:
            container_client = blob_service_client.get_container_client(CONTAINER)
            blob_client = container_client.get_blob_client(blob)
            try:
                is_success, buffer = cv2.imencode('.png', image)
                if not is_success:
                    # Do not upload a broken payload that would later be
                    # treated as a valid mask.
                    raise ValueError(f"could not PNG-encode image for blob {blob!r}")
                await blob_client.upload_blob(data=buffer.tobytes(), overwrite=True)
            finally:
                await blob_client.close()
                await container_client.close()
async def get_mask(img1, img2):
    """Build a binary mask of the pixels where *img1* and *img2* differ.

    Assumes img1 and img2 are exactly the same, except text areas.

    Returns:
        The ``(retval, mask)`` pair from ``cv2.threshold``: the grayscale
        absolute difference is thresholded at 10, and pixels above it are
        set to 255.

    Raises:
        ValueError: if ``cv2.absdiff`` rejects the pair (e.g. the images
            are not the same size).
    """
    try:
        diff = cv2.absdiff(img1, img2)
    except cv2.error as err:
        # Only OpenCV's own failure is translated; anything else propagates.
        raise ValueError("img1 and img2 are not the same size") from err
    # NOTE(review): callers in this file decode with IMREAD_COLOR, which yields
    # BGR data, while this conversion code is the RGBA one — confirm which
    # channel weighting is intended.
    mask = cv2.cvtColor(diff, cv2.COLOR_RGBA2GRAY)
    thresh, binmask = cv2.threshold(mask, 10, 255, cv2.THRESH_BINARY)
    return thresh, binmask
async def filter_mask_size(mask, thresh=0.4):
    """Report whether the non-zero part of *mask* exceeds a fraction of its area.

    Args:
        mask: 2-D array accepted by ``cv2.countNonZero``.
        thresh: fraction of the total pixel count used as the limit.

    Returns:
        ``True`` when the number of non-zero pixels is strictly greater than
        ``thresh`` times ``height * width``, otherwise ``False``.
    """
    height, width = mask.shape[0], mask.shape[1]
    limit = height * width * thresh
    return cv2.countNonZero(mask) > limit
# Module-level tally of process_cdf() outcomes, keyed by failure reason or
# "success"; printed at the end of main().
mask_filtered_count = Counter()
async def process_cdf(blob: BlobAsync, collection, cdf):
    """Render and upload the difference mask for one CDF document.

    Downloads ``<folder>/t=true.png`` and ``<folder>/t=false.png``, derives a
    mask from their difference, runs the mask filters, and uploads
    ``<folder>/mask.png`` on success. The outcome is recorded on the document
    in *collection* and tallied in ``mask_filtered_count``.

    Args:
        blob: storage helper used to download and upload images.
        collection: collection whose document ``cdf["_id"]`` is updated.
        cdf: document providing ``"rendered_folder"`` and ``"_id"``.
    """
    folder = cdf["rendered_folder"]
    doc_filter = {"_id": cdf["_id"]}

    # Fetch both renders concurrently.
    async with asyncio.TaskGroup() as group:
        true_task = group.create_task(blob.open_image(f"{folder}/t=true.png"))
        false_task = group.create_task(blob.open_image(f"{folder}/t=false.png"))
    img_true = true_task.result()
    img_false = false_task.result()

    # Work out which render, if any, is missing.
    missing_reason = None
    if img_true is None and img_false is None:
        missing_reason = "not found both"
    elif img_true is None:
        missing_reason = "not found t=true"
    elif img_false is None:
        missing_reason = "not found t=false"
    if missing_reason is not None:
        mask_filtered_count["not found"] += 1
        await collection.update_one(
            doc_filter,
            {"$set": {"last_mask_render": -1, "failed_reason": missing_reason}},
        )
        return

    try:
        _, mask = await get_mask(img_true, img_false)
    except ValueError:
        await collection.update_one(
            doc_filter,
            {"$set": {"last_mask_render": -1, "failed_reason": "size not match"}},
        )
        mask_filtered_count["size not match"] += 1
        return

    # Each entry pairs a rejecting predicate with the reason stored on failure.
    mask_filters = ((filter_mask_size, "mask too small"),)
    async with asyncio.TaskGroup() as group:
        pending = [
            (group.create_task(check(mask)), label)
            for check, label in mask_filters
        ]
    for finished, label in pending:
        if not finished.result():
            continue
        mask_filtered_count[label] += 1
        await collection.update_one(
            doc_filter,
            {
                "$set": {"last_mask_render": -1, "failed_reason": label},
                "$unset": {"last_fetched": -1},
            },
        )
        return

    await blob.upload_image(f"{folder}/mask.png", mask)
    await collection.update_one(
        doc_filter, {"$set": {"last_mask_render": time.time()}}
    )
    mask_filtered_count["success"] += 1
def _pending_cdf_query():
    """Build the MongoDB filter selecting CDFs that still need a mask render."""
    return {
        '$or': [
            {'$and': [
                {'rendered_folder': {'$exists': True}},
                {'last_fetched': {'$gt': 1697688216}},
                # Evaluated on every call so the 600-second cut-off tracks "now".
                {'last_fetched': {'$lt': time.time() - 600}},
                {'last_mask_render': {'$exists': False}},
            ]},
            {'$and': [
                {'last_fetched': {'$gt': 1697998932}},
                {'last_mask_render': {'$not': {'$gt': 0}}},
            ]},
        ]
    }


async def _fetch_pending_cdfs(collection):
    """Return up to 200 pending CDF documents, always closing the cursor."""
    cursor = collection.find(_pending_cdf_query(), batch_size=400)
    try:
        return await cursor.to_list(length=200)
    finally:
        await cursor.close()


async def main():
    """Process pending CDFs in batches until none are left, then print the tally.

    Each batch is handled concurrently by ``process_cdf``; after a batch
    finishes the loop pauses 10 seconds before querying again.
    """
    client = AsyncIOMotorClient(MONGODB_URI)
    try:
        db = client.get_database("canva")
        collection = db["cdf"]
        logger = logging.getLogger('azure.mgmt.resource')
        logger.setLevel(logging.WARNING)
        blob = BlobAsync()

        cdf_list = await _fetch_pending_cdfs(collection)
        # Truthiness test: `is not []` compares identity with a fresh list and
        # is always true, which made this loop endless.
        while cdf_list:
            async with asyncio.TaskGroup() as g:
                for cdf in cdf_list:
                    g.create_task(process_cdf(blob, collection, cdf))
            await asyncio.sleep(10)
            cdf_list = await _fetch_pending_cdfs(collection)
        print(mask_filtered_count)
    finally:
        client.close()


if __name__ == "__main__":
    asyncio.run(main())

View file

@ -0,0 +1,33 @@
import argparse
import json
from re import L
import cv2
import pprint
import os.path as osp
from cdf_parser import CdfParser
from threading import Lock
from concurrent.futures import ThreadPoolExecutor
# Shared output file, opened in append mode when the module is loaded;
# get_text_mask() writes one JSON line per input record to it.
output_f = open("cdfs_with_masks.json", "a+")
# Lock created alongside the shared output file — presumably meant to
# serialise writes from the worker threads; verify it is held around writes.
output_lock = Lock()
def get_text_mask(line):
    """Parse one CDF JSON line and append its text-layer record to the output file.

    The record is the input object with ``"text_layer"`` set to the result of
    ``CdfParser.get_texts()`` and the ``"content"`` key removed.

    Args:
        line: a single JSON document with ``"content"`` and ``"rendered_folder"``.
    """
    data = json.loads(line)
    cdf_parser = CdfParser(data['content'], data['rendered_folder'])
    data["text_layer"] = cdf_parser.get_texts()
    del data["content"]
    record = json.dumps(data) + "\n"
    # This function runs on pool threads; hold the lock so records from
    # different threads cannot interleave in the shared file.
    with output_lock:
        output_f.write(record)
def main(cdf_file):
    """Run ``get_text_mask`` over every line of *cdf_file* on a thread pool.

    Args:
        cdf_file: path to a file containing one CDF JSON document per line.

    Exceptions raised inside a worker stay on its future and are not
    re-raised here.
    """
    # One pool for the whole file: creating (and joining) a pool per line
    # forces every line to finish before the next one is submitted.
    with open(cdf_file, 'r') as f, ThreadPoolExecutor(max_workers=24) as executor:
        for line in f:
            executor.submit(get_text_mask, line)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cdf", type=str, required=True)
    args = parser.parse_args()
    main(args.cdf)

65
scripts/vis.py Normal file
View file

@ -0,0 +1,65 @@
import json
from collections import Counter
# Script: read sample_cdfs.json (one JSON document per line) and print an HTML
# comparison page to stdout.
# NOTE(review): this SAS query string contains a signature (`sig=`) committed
# in source; consider loading it from the environment.
SAS = "sp=racwdli&st=2023-09-17T15:37:58Z&se=2023-12-31T23:37:58Z&spr=https&sv=2022-11-02&sr=c&sig=u%2FPbZ4fNttAPeLj0NEEpX0eIgFcjhot%2Bmy3iGd%2BCmxk%3D" # e.g sp=r&st=...
with open("sample_cdfs.json", "r") as f:
    # Parse each line on its own; calling json.load(f) inside the loop would
    # consume the remainder of the file instead of the current line.
    data = [json.loads(line) for line in f if line.strip()]
print("""
<html>
<head>
<style>
.row {
display: flex;
flex-direction: row;
justify-content: start;
align-items: center;
margin-bottom: 16px;
}
.column {
flex: 15%;
padding: 0 16px;
}
</style>
</head>
<body>
""")
for index, item in enumerate(data):
    # The SAS query string is appended so the image URL is directly loadable.
    file = f"https://internblob.blob.core.windows.net/v-lixinyang/canva-render-11.30/{item['rendered_folder']}-()?{SAS}"
    # NOTE(review): gt_file, if_file and sdxl_file are not defined anywhere in
    # this script, so the template below raises NameError until they are
    # assigned — confirm the intended URLs.
    # One item per row: the row is opened and closed for every item.
    print('<div class="row">')
    print(f"""
<div class="column">
<img src="{gt_file}" alt="image" style="max-width: 100%;">
</div>
<div class="column">
<img src="{file}" alt="image" style="max-width: 100%;">
</div>
<div class="column">
<img src="{if_file}" alt="image" style="max-width: 100%;">
</div>
<div class="column">
<img src="{sdxl_file}" alt="image" style="max-width: 100%;">
</div>
""")
    print('</div>')
    print(f"""
<div class="row">
<p>Index: {index} <br>
Category: {item["category"]} <br>
{item["caption"]} <br>
Tags: {item["tags"]} <br>
Texts: {item["texts"]}</p>
</div>
""")
print("""
</body>
</html>
""")